<h2 align="center">Automatic Machine Learning with H2O AutoML</h2>

### Task 2: Importing Packages

In [2]:
import pandas as pd
pd.options.display.max_rows = 999
import numpy as np
import matplotlib.pyplot as plt

### Task 3: Loading and Exploring the Data

In [3]:
xls = pd.ExcelFile('data/bank_term_deposit_marketing_analysis.xlsx')

In [4]:
xls.sheet_names

['PROCEDURE',
 'DATA DESCRIPTION',
 'Step 1 - Collect Information',
 'CLIENT_INFO',
 'LOAN_HISTORY',
 'MARKETING HISTORY',
 'SUBSCRIPTION HISTORY',
 'Step 2 - Merge Information',
 'CLIENT_MERGE',
 'Step 3 - Marketing Analysis',
 'DAILY RANGE',
 'JOB ANALYSIS',
 'Sheet3']

In [5]:
client_info = pd.read_excel(xls, 'CLIENT_INFO')
loan_history = pd.read_excel(xls, 'LOAN_HISTORY')
marketing_history = pd.read_excel(xls, 'MARKETING HISTORY')
subscription_history = pd.read_excel(xls, 'SUBSCRIPTION HISTORY')

In [6]:
client_info.head()

Unnamed: 0,ID,AGE,JOB,MARITAL,EDUCATION
0,2836,58,management,married,tertiary
1,2837,44,technician,single,secondary
2,2838,33,entrepreneur,married,secondary
3,2839,47,blue-collar,married,unknown
4,2840,33,unknown,single,unknown


In [7]:
loan_history.head()

Unnamed: 0,ID,DEFAULT,BALANCE,HOUSING,LOAN
0,2836,no,2143,yes,no
1,2837,no,29,yes,no
2,2838,no,2,yes,yes
3,2839,no,1506,yes,no
4,2840,no,1,no,no


In [8]:
marketing_history.head()

Unnamed: 0,ID,CONTACT,DAY,MONTH,DURATION,CAMPAIGN,PDAYS,PREVIOUS,POUTCOME
0,2836,unknown,5,may,261,1,-1,0,unknown
1,2837,unknown,5,may,151,1,-1,0,unknown
2,2838,unknown,5,may,76,1,-1,0,unknown
3,2839,unknown,5,may,92,1,-1,0,unknown
4,2840,unknown,5,may,198,1,-1,0,unknown


In [9]:
subscription_history.head()

Unnamed: 0,ID,TERM_DEPOSIT
0,2836,no
1,2837,no
2,2838,no
3,2839,no
4,2840,no


In [10]:
df = pd.merge(client_info, loan_history, on=['ID'])
df = pd.merge(df, marketing_history, on=['ID'])
df = pd.merge(df, subscription_history, on=['ID'])
df.head()

Unnamed: 0,ID,AGE,JOB,MARITAL,EDUCATION,DEFAULT,BALANCE,HOUSING,LOAN,CONTACT,DAY,MONTH,DURATION,CAMPAIGN,PDAYS,PREVIOUS,POUTCOME,TERM_DEPOSIT
0,2836,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,2837,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,2838,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,2839,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,2840,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [11]:
df = df.drop(['ID'], axis=1)

### Task 4: Data Prep & Start H2O

In [33]:
import h2o
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,30 mins 08 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.3
H2O_cluster_version_age:,6 months and 7 days !!!
H2O_cluster_name:,H2O_from_python_rhyme_whq9my
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.795 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


In [13]:
h2o_df = h2o.H2OFrame(df)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [14]:
h2o_df.describe()

Rows:45211
Cols:17




Unnamed: 0,AGE,JOB,MARITAL,EDUCATION,DEFAULT,BALANCE,HOUSING,LOAN,CONTACT,DAY,MONTH,DURATION,CAMPAIGN,PDAYS,PREVIOUS,POUTCOME,TERM_DEPOSIT
type,int,enum,enum,enum,enum,int,enum,enum,enum,int,enum,int,int,int,int,enum,enum
mins,18.0,,,,,-8019.0,,,,1.0,,0.0,1.0,-1.0,0.0,,
mean,40.93621021432807,,,,,1362.2720576850802,,,,15.806418791886923,,258.16307978146915,2.7638406582468997,40.19782796222158,0.5803233726305561,,
maxs,95.0,,,,,102127.0,,,,31.0,,4918.0,63.0,871.0,275.0,,
sigma,10.618762040975398,,,,,3044.7658291685216,,,,8.32247615304459,,257.52781226517124,3.0980208832791805,100.12874599059822,2.303441044931215,,
zeros,0,,,,,3514,,,,0,,3,0,0,36954,,
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,58.0,management,married,tertiary,no,2143.0,yes,no,unknown,5.0,may,261.0,1.0,-1.0,0.0,unknown,no
1,44.0,technician,single,secondary,no,29.0,yes,no,unknown,5.0,may,151.0,1.0,-1.0,0.0,unknown,no
2,33.0,entrepreneur,married,secondary,no,2.0,yes,yes,unknown,5.0,may,76.0,1.0,-1.0,0.0,unknown,no


In [15]:
train, test = h2o_df.split_frame(ratios=[.75])
x = train.columns
y = 'TERM_DEPOSIT'
x.remove(y)

### Task 5: Run H2O AutoML

In [28]:
from h2o.automl import H2OAutoML

In [29]:
df.TERM_DEPOSIT.value_counts()

no     39922
yes     5289
Name: TERM_DEPOSIT, dtype: int64

In [30]:
aml = H2OAutoML(max_runtime_secs=600,
               balance_classes=True,
                stopping_metric='logloss' ,
               project_name='Final',
               seed=1)
%time aml.train(x=x, y=y, training_frame=train)

AutoML progress: |
18:14:21.672: New models will be added to existing leaderboard Final@@TERM_DEPOSIT (leaderboard frame=null) with already 28 models.

█████████████████████████████████████████████████████
18:23:54.562: StackedEnsemble_BestOfFamily_AutoML_20201121_181421 [StackedEnsemble best (built using top model from each algorithm type)] failed: water.exceptions.H2OIllegalArgumentException: Failed to find the xval predictions frame. . .  Looks like keep_cross_validation_predictions wasn't set when building the models, or the frame was deleted.


18:23:55.573: StackedEnsemble_AllModels_AutoML_20201121_181421 [StackedEnsemble all (built using all AutoML models)] failed: water.exceptions.H2OIllegalArgumentException: Failed to find the xval predictions frame. . .  Looks like keep_cross_validation_predictions wasn't set when building the models, or the frame was deleted.

███| 100%
CPU times: user 1min 10s, sys: 1.15 s, total: 1min 11s
Wall time: 9min 35s


### Task 6: AutoML Leaderboard and Ensemble Exploration

In [23]:
lb = aml.leaderboard
lb.head(rows=lb.nrows)

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
XGBoost_grid__1_AutoML_20201121_175520_model_2,0.934792,0.199725,0.6208,0.153067,0.250729,0.0628651
StackedEnsemble_AllModels_AutoML_20201121_175520,0.932008,0.213095,0.617723,0.16631,0.254037,0.0645347
XGBoost_3_AutoML_20201121_175520,0.93199,0.202979,0.620361,0.175289,0.251391,0.0631975
GBM_grid__1_AutoML_20201121_175520_model_1,0.931931,0.202448,0.612292,0.171305,0.252574,0.0637938
XGBoost_grid__1_AutoML_20201121_175520_model_1,0.931752,0.204921,0.607461,0.16005,0.253856,0.0644429
GBM_grid__1_AutoML_20201121_175520_model_2,0.931561,0.209657,0.612522,0.16893,0.257397,0.0662534
XGBoost_grid__1_AutoML_20201121_175520_model_5,0.931091,0.205111,0.615071,0.167683,0.252106,0.0635574
GBM_2_AutoML_20201121_175520,0.928748,0.226669,0.601725,0.164261,0.267424,0.0715155
XGBoost_grid__1_AutoML_20201121_175520_model_4,0.928746,0.215791,0.59357,0.171507,0.259813,0.0675026
GBM_5_AutoML_20201121_175520,0.928223,0.240794,0.58807,0.165032,0.275846,0.0760907




In [31]:
se = aml.leader
metalearner = h2o.get_model(se.metalearner()['name'])

AttributeError: type object 'ModelBase' has no attribute 'metalearner'

In [27]:
metalearner.varimp()

NameError: name 'metalearner' is not defined

### Task 7: Base Learner XGBoost Model Exploration

In [None]:
model = h2o.get_model('XGBoost_grid__1_AutoML_20200605_110413_model_2')