# Primer modelo automático usando H2O

## Importamos las librerías

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import h2o
from h2o.automl import H2OAutoML

## Importamos los datos con pandas

In [2]:
# Cargando los datos
datos_titanic = pd.read_csv('./titanic_train.csv')
entrenamiento, pruebas = train_test_split(datos_titanic,test_size=0.3)

In [3]:
entrenamiento.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,623.0,623.0,623.0,496.0,623.0,623.0,623.0
mean,445.770465,0.373997,2.29695,29.667681,0.510433,0.388443,31.569415
std,257.121139,0.484252,0.844944,14.856592,1.091978,0.825631,47.666306
min,2.0,0.0,1.0,0.67,0.0,0.0,0.0
25%,224.5,0.0,1.0,20.0,0.0,0.0,7.8958
50%,444.0,0.0,3.0,28.0,0.0,0.0,14.5
75%,665.0,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
entrenamiento.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
421,422,0,3,"Charters, Mr. David",male,21.0,0,0,A/5. 13032,7.7333,,Q
223,224,0,3,"Nenkoff, Mr. Christo",male,,0,0,349234,7.8958,,S
189,190,0,3,"Turcin, Mr. Stjepan",male,36.0,0,0,349247,7.8958,,S
558,559,1,1,"Taussig, Mrs. Emil (Tillie Mandelbaum)",female,39.0,1,1,110413,79.65,E67,S
237,238,1,2,"Collyer, Miss. Marjorie ""Lottie""",female,8.0,0,2,C.A. 31921,26.25,,S


## Hacemos una "limpieza" de nuestro datos antes de hacer el modelo

In [5]:
combine = [entrenamiento, pruebas]

In [6]:
# Convert string values 'male' and 'female' to int values
sex_mapping = {'male': 0, 'female': 1}
entrenamiento['Sex'] = entrenamiento['Sex'].map(sex_mapping)
pruebas['Sex'] = pruebas['Sex'].map(sex_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [7]:
calculo_edades = np.zeros((2,3))

In [8]:
for dataset in combine:
    for sex in range(0, 2):
        for pclass in range(0, 3):
            guess_df = dataset[(dataset['Sex'] == sex) & (dataset['Pclass'] == pclass+1)]['Age'].dropna()
            age_guess = guess_df.median()
            calculo_edades[sex, pclass] = int(age_guess/0.5 + 0.5) * 0.5
    
    for sex in range(0, 2):
        for pclass in range(0, 3):
            dataset.loc[(dataset.Age.isnull()) & (dataset.Sex == sex) &(dataset.Pclass == pclass+1),'Age'] = calculo_edades[sex, pclass]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [9]:
entrenamiento = entrenamiento.drop(['Ticket', 'Cabin', 'Name', 'PassengerId', 'SibSp', 'Parch', 'Embarked'], axis=1)
pruebas = pruebas.drop(['Ticket', 'Cabin', 'Name', 'SibSp', 'Parch', 'Embarked'], axis=1)


## Creamos y entrenamos nuestro modelo

In [10]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 25.241-b07, mixed mode)
  Starting server from C:\Users\Administrator\AppData\Roaming\Python\Python37\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\ADMINI~1\AppData\Local\Temp\1\tmpz8268nqw
  JVM stdout: C:\Users\ADMINI~1\AppData\Local\Temp\1\tmpz8268nqw\h2o_Administrator_started_from_python.out
  JVM stderr: C:\Users\ADMINI~1\AppData\Local\Temp\1\tmpz8268nqw\h2o_Administrator_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,18 secs
H2O_cluster_timezone:,UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.2
H2O_cluster_version_age:,15 days
H2O_cluster_name:,H2O_from_python_Administrator_p2c2o6
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.757 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


In [11]:
entrenamiento = h2o.H2OFrame(entrenamiento)
pruebas = h2o.H2OFrame(pruebas)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [12]:
x = entrenamiento.columns
y = 'Survived'
x.remove(y)

In [13]:
entrenamiento[y] = entrenamiento[y].asfactor()

In [14]:
aml_ti = H2OAutoML(max_runtime_secs= 120,max_models= 10)

In [15]:
aml_ti.train(x = x, y = y,
          training_frame = entrenamiento)

AutoML progress: |
23:16:46.713: AutoML: XGBoost is not available; skipping it.

████████████████████████████████████████████████████████| 100%


In [16]:
lb_ti = aml_ti.leaderboard
lb_ti

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
GBM_1_AutoML_20200514_231646,0.865418,0.422783,0.836336,0.187675,0.361024,0.130338
GBM_3_AutoML_20200514_231646,0.863893,0.425542,0.832945,0.185056,0.361435,0.130635
GBM_4_AutoML_20200514_231646,0.861533,0.432837,0.827093,0.180401,0.363912,0.132432
GBM_2_AutoML_20200514_231646,0.8603,0.428529,0.832264,0.186002,0.36182,0.130914
StackedEnsemble_BestOfFamily_AutoML_20200514_231646,0.857918,0.414541,0.827484,0.18001,0.356843,0.127337
GBM_grid__1_AutoML_20200514_231646_model_1,0.856878,0.434578,0.827315,0.194641,0.366922,0.134632
DeepLearning_1_AutoML_20200514_231646,0.855744,0.432237,0.816939,0.196093,0.365037,0.133252
StackedEnsemble_AllModels_AutoML_20200514_231646,0.855233,0.416884,0.825455,0.183438,0.357846,0.128054
GLM_1_AutoML_20200514_231646,0.844404,0.450883,0.792748,0.184637,0.375631,0.141098
DRF_1_AutoML_20200514_231646,0.843931,1.5328,0.773715,0.206069,0.387438,0.150108




In [17]:
h2o.get_model('GBM_1_AutoML_20200514_231646')

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  GBM_1_AutoML_20200514_231646


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,37.0,37.0,23022.0,6.0,6.0,6.0,29.0,57.0,44.91892




ModelMetricsBinomial: gbm
** Reported on train data. **

MSE: 0.05492856962665486
RMSE: 0.23436844844529492
LogLoss: 0.2128517845020708
Mean Per-Class Error: 0.06560471002531088
AUC: 0.9804721030042919
AUCPR: 0.975846639930719
Gini: 0.9609442060085838

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.41731862755628235: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,379.0,11.0,0.0282,(11.0/390.0)
1,1,24.0,209.0,0.103,(24.0/233.0)
2,Total,403.0,220.0,0.0562,(35.0/623.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.417319,0.922737,158.0
1,max f2,0.231244,0.930521,209.0
2,max f0point5,0.521223,0.952153,144.0
3,max accuracy,0.417319,0.94382,158.0
4,max precision,0.978921,1.0,0.0
5,max recall,0.070316,1.0,336.0
6,max specificity,0.978921,1.0,0.0
7,max absolute_mcc,0.417319,0.879552,158.0
8,max min_per_class_accuracy,0.321242,0.925641,182.0
9,max mean_per_class_accuracy,0.417319,0.934395,158.0



Gains/Lift Table: Avg response rate: 37.40 %, avg score: 37.28 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.011236,0.974578,2.67382,2.67382,1.0,0.975997,1.0,0.975997,0.030043,0.030043,167.381974,167.381974
1,,2,0.020867,0.973194,2.67382,2.67382,1.0,0.97386,1.0,0.975011,0.025751,0.055794,167.381974,167.381974
2,,3,0.030498,0.971479,2.67382,2.67382,1.0,0.972412,1.0,0.97419,0.025751,0.081545,167.381974,167.381974
3,,4,0.040128,0.96971,2.67382,2.67382,1.0,0.970864,1.0,0.973392,0.025751,0.107296,167.381974,167.381974
4,,5,0.051364,0.968253,2.67382,2.67382,1.0,0.968923,1.0,0.972414,0.030043,0.137339,167.381974,167.381974
5,,6,0.101124,0.955462,2.67382,2.67382,1.0,0.961817,1.0,0.9672,0.133047,0.270386,167.381974,167.381974
6,,7,0.150883,0.922467,2.67382,2.67382,1.0,0.93769,1.0,0.957468,0.133047,0.403433,167.381974,167.381974
7,,8,0.200642,0.87073,2.67382,2.67382,1.0,0.901335,1.0,0.943547,0.133047,0.536481,167.381974,167.381974
8,,9,0.300161,0.654072,2.587567,2.645223,0.967742,0.761745,0.989305,0.88327,0.257511,0.793991,158.756749,164.522274
9,,10,0.399679,0.292819,1.380036,2.330196,0.516129,0.439815,0.871486,0.772852,0.137339,0.93133,38.0036,133.019632




ModelMetricsBinomial: gbm
** Reported on cross-validation data. **

MSE: 0.13033844564596098
RMSE: 0.36102416213594485
LogLoss: 0.4227829836608968
Mean Per-Class Error: 0.18767470012105214
AUC: 0.8654176295807198
AUCPR: 0.8363356359956586
Gini: 0.7308352591614395

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.3594911669864877: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,334.0,56.0,0.1436,(56.0/390.0)
1,1,54.0,179.0,0.2318,(54.0/233.0)
2,Total,388.0,235.0,0.1766,(110.0/623.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.359491,0.764957,181.0
1,max f2,0.202783,0.805802,241.0
2,max f0point5,0.771677,0.805658,99.0
3,max accuracy,0.610527,0.829856,134.0
4,max precision,0.978851,1.0,0.0
5,max recall,0.026935,1.0,399.0
6,max specificity,0.978851,1.0,0.0
7,max absolute_mcc,0.558145,0.630686,153.0
8,max min_per_class_accuracy,0.280367,0.8,208.0
9,max mean_per_class_accuracy,0.359491,0.812325,181.0



Gains/Lift Table: Avg response rate: 37.40 %, avg score: 37.64 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.011236,0.973127,2.67382,2.67382,1.0,0.976555,1.0,0.976555,0.030043,0.030043,167.381974,167.381974
1,,2,0.020867,0.971554,2.67382,2.67382,1.0,0.972205,1.0,0.974547,0.025751,0.055794,167.381974,167.381974
2,,3,0.030498,0.969693,2.67382,2.67382,1.0,0.970846,1.0,0.973378,0.025751,0.081545,167.381974,167.381974
3,,4,0.040128,0.966507,2.67382,2.67382,1.0,0.968096,1.0,0.972111,0.025751,0.107296,167.381974,167.381974
4,,5,0.051364,0.964738,2.291845,2.590263,0.857143,0.965495,0.96875,0.970663,0.025751,0.133047,129.184549,159.026288
5,,6,0.101124,0.945536,2.415063,2.504053,0.903226,0.955911,0.936508,0.963404,0.120172,0.253219,141.506299,150.405341
6,,7,0.150883,0.916201,2.587567,2.531595,0.967742,0.930683,0.946809,0.952613,0.128755,0.381974,158.756749,153.159529
7,,8,0.200642,0.856165,2.070054,2.417133,0.774194,0.885545,0.904,0.93598,0.103004,0.484979,107.005399,141.713305
8,,9,0.300161,0.607627,1.897549,2.244865,0.709677,0.724736,0.839572,0.865942,0.188841,0.67382,89.754949,124.48647
9,,10,0.399679,0.317923,1.078153,1.954358,0.403226,0.456235,0.730924,0.763926,0.107296,0.781116,7.815312,95.435821




Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,accuracy,0.8427226,0.011759682,0.84,0.832,0.84,0.83870965,0.86290324
1,auc,0.8664121,0.024851287,0.8983075,0.8445617,0.84053385,0.88383025,0.8648272
2,aucpr,0.84148055,0.02738292,0.8537077,0.7940579,0.8421083,0.8599944,0.8575346
3,err,0.15727742,0.011759682,0.16,0.168,0.16,0.16129032,0.13709678
4,err_count,19.6,1.5165751,20.0,21.0,20.0,20.0,17.0
5,f0point5,0.79643625,0.036614254,0.74786323,0.7894737,0.80097085,0.7933579,0.8505155
6,f1,0.78518295,0.017842997,0.7777778,0.7741935,0.76744187,0.8113208,0.79518074
7,f2,0.7766016,0.04118126,0.8101852,0.75949365,0.73660713,0.83011585,0.74660635
8,lift_top_group,2.6849546,0.19799966,2.9761906,2.6041667,2.7173913,2.4313726,2.6956522
9,logloss,0.42273375,0.0433518,0.37495628,0.49084017,0.4330753,0.41048503,0.40431198



See the whole table with table.as_data_frame()

Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error
0,,2020-05-14 23:16:56,2.690 sec,0.0,0.483863,0.661049,0.5,0.373997,1.0,0.626003
1,,2020-05-14 23:16:56,2.719 sec,5.0,0.386208,0.479914,0.944382,0.932306,2.67382,0.093098
2,,2020-05-14 23:16:56,2.742 sec,10.0,0.329949,0.382578,0.95739,0.946868,2.67382,0.089888
3,,2020-05-14 23:16:56,2.758 sec,15.0,0.301374,0.331839,0.961274,0.953742,2.67382,0.086677
4,,2020-05-14 23:16:56,2.770 sec,20.0,0.278844,0.29091,0.966221,0.959682,2.67382,0.077047
5,,2020-05-14 23:16:56,2.782 sec,25.0,0.263327,0.261566,0.971151,0.964821,2.67382,0.072231
6,,2020-05-14 23:16:56,2.802 sec,30.0,0.252281,0.241989,0.972945,0.966995,2.67382,0.070626
7,,2020-05-14 23:16:56,2.823 sec,35.0,0.239348,0.220185,0.977578,0.972976,2.67382,0.0626
8,,2020-05-14 23:16:56,2.829 sec,37.0,0.234368,0.212852,0.980472,0.975847,2.67382,0.05618



Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,Sex,182.777588,1.0,0.316125
1,Age,170.915268,0.9351,0.295608
2,Fare,135.723389,0.74256,0.234742
3,Pclass,88.765167,0.485646,0.153525




## Predecimos con nuestro árbol y el coeficiente de determinación

In [18]:
Y_pred = aml_ti.leader.predict(pruebas)

gbm prediction progress: |████████████████████████████████████████████████| 100%


In [19]:
Y_pred

predict,p0,p1
1,0.0348354,0.965165
0,0.815347,0.184653
0,0.877107,0.122893
0,0.933309,0.0666909
0,0.929689,0.070311
1,0.467821,0.532179
0,0.872299,0.127701
0,0.898222,0.101778
0,0.603652,0.396348
0,0.946203,0.0537973


