In [1]:
#!pip install --pre pycaret

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

# Imported Libraries

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices raised by pandas and the other libraries used below.
warnings.filterwarnings("ignore")

In [3]:
# Read the three competition files in one pass: training data, test data and
# the submission template.
train_df, test_df, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/sub-pred/train.csv',
        '../input/sub-pred/test.csv',
        '../input/sub-pred/submission.csv',
    )
)

# Display (rows, columns) for each frame as a quick sanity check.
tuple(frame.shape for frame in (train_df, test_df, sub))

((21000, 17), (9000, 16), (9000, 1))

In [4]:
# Preview the first rows of the raw training frame.
train_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y_bool
0,51,admin.,married,tertiary,no,148,no,no,cellular,15,apr,263,2,-1,0,unknown,0
1,55,blue-collar,married,secondary,no,7160,yes,no,unknown,4,may,315,1,-1,0,unknown,0
2,65,blue-collar,divorced,tertiary,no,2197,yes,no,cellular,12,may,102,2,-1,0,unknown,0
3,25,admin.,married,secondary,no,6658,yes,no,cellular,16,feb,197,1,-1,0,unknown,1
4,36,services,married,secondary,no,1761,yes,no,cellular,19,sep,177,1,-1,0,success,0


In [5]:
# Largest per-column count of missing values in the training frame;
# a result of 0 means no column contains any nulls.
train_df.isna().sum().max()

0

In [6]:
def preprocess_data(df):
    """Encode the yes/no and categorical campaign columns of ``df`` as small integers.

    The frame is modified in place and the same object is returned.

    Columns rewritten:
      * ``contact``  -- 'cellular' -> 1; 'telephone' and 'unknown' -> 0
      * ``loan``, ``housing`` -- 'yes' -> 1, 'no' -> 0
      * ``default``  -- 'no' -> 1, 'yes' -> 0 (opposite polarity to ``loan``/``housing``)
      * ``poutcome`` -- 'success' and 'other' -> 1; 'unknown' and 'failure' -> 0
      * ``pdays``    -- -1 is replaced by 0
      * ``previous`` -- 1 where the value is > 0, otherwise 0

    All mapped columns are validated before anything is written, so a failed
    call leaves ``df`` untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns listed above, with the mapped columns
        still holding their original string labels.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the columns above overwritten.

    Raises
    ------
    ValueError
        If a mapped column contains a value (or a missing value) that is not a
        key of its mapping -- for example when the function is run a second
        time on a frame that has already been encoded.
    """
    # NOTE(review): 'default' maps 'no' -> 1 while loan/housing map 'yes' -> 1;
    # kept as-is so encoded values do not change -- confirm this is intentional.
    mappings = {
        'contact': {'cellular': 1, 'telephone': 0, 'unknown': 0},
        'loan': {'yes': 1, 'no': 0},
        'housing': {'yes': 1, 'no': 0},
        'default': {'no': 1, 'yes': 0},
        'poutcome': {'unknown': 0, 'failure': 0, 'success': 1, 'other': 1},
    }

    # Pass 1: encode into temporaries and validate. Series.map yields NaN for
    # any value missing from the mapping; report that explicitly instead of
    # letting the integer cast fail with an unrelated-looking message.
    encoded = {}
    for col, mapping in mappings.items():
        codes = df[col].map(mapping)
        unmapped = codes.isna()
        if unmapped.any():
            bad_values = sorted(repr(v) for v in df.loc[unmapped, col].unique().tolist())
            raise ValueError(
                f"preprocess_data: column {col!r} contains values outside its mapping: "
                f"{', '.join(bad_values)} (expected one of {sorted(mapping)}). "
                "Has this frame already been preprocessed?"
            )
        encoded[col] = codes.astype('uint8')

    # Pass 2: everything validated, so now overwrite the columns in place.
    for col, codes in encoded.items():
        df[col] = codes

    df['pdays'] = df['pdays'].replace(-1, 0)  # replace with 0 if not contact
    df['previous'] = (df['previous'] > 0).astype('uint8')  # binary has contact or not

    return df

# Apply the same encoding to the training frame first, then the test frame.
train_df, test_df = map(preprocess_data, (train_df, test_df))

In [7]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler,PowerTransformer,LabelEncoder
# Integer-encode the remaining string-valued columns. Each encoder is fitted on
# the combined train and test values so both frames share one code table and
# transform() never meets a label it was not fitted on.
for col in ['job', 'marital', 'education', 'month']:
    # pd.concat replaces Series.append, which was deprecated in pandas 1.4 and
    # removed in pandas 2.0.
    le = LabelEncoder().fit(pd.concat([train_df[col], test_df[col]]))
    train_df[col] = le.transform(train_df[col])
    test_df[col] = le.transform(test_df[col])

In [8]:
# Preview the training frame again after the encoding steps above.
train_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y_bool
0,51,0,1,2,1,148,0,0,1,15,0,263,2,0,0,0,0
1,55,1,1,1,1,7160,1,0,0,4,8,315,1,0,0,0,0
2,65,1,0,2,1,2197,1,0,1,12,8,102,2,0,0,0,0
3,25,0,1,1,1,6658,1,0,1,16,3,197,1,0,0,0,1
4,36,7,1,1,1,1761,1,0,1,19,11,177,1,0,0,1,0


In [9]:
def fet_engg(df):
    """Add combination features built from the integer-coded categorical columns.

    For every column group listed below, a new column named by joining the
    members with ``'+'`` (e.g. ``'job+marital'``) is added, holding the row-wise
    sum of the member columns as int64. A ``'day_x_month'`` column holding
    ``day * month`` is added as well.

    The sums are computed with vectorised column arithmetic rather than a
    row-wise ``DataFrame.apply``; the resulting values are the same.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric ``job``, ``marital``, ``education``, ``contact``,
        ``month`` and ``day`` columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the twelve new columns appended.

    Raises
    ------
    ValueError
        If a member column contains a missing value, since the sum cannot then
        be cast to an integer.
    """
    # NOTE(review): summing label codes does not give a unique id per
    # combination (job=1, marital=2 and job=2, marital=1 both produce 3) --
    # confirm that a sum, not a distinct combined category, is intended.
    combinations = [
        ('job', 'marital'),
        ('job', 'education'),
        ('job', 'contact'),
        ('job', 'month'),
        ('marital', 'education'),
        ('job', 'marital', 'education'),
        ('job', 'marital', 'contact'),
        ('job', 'marital', 'month'),
        ('job', 'education', 'month'),
        ('marital', 'contact'),
        ('marital', 'month', 'education'),
    ]
    for members in combinations:
        # skipna=False keeps a missing input an error instead of silently
        # treating it as zero.
        df['+'.join(members)] = (
            df[list(members)].sum(axis=1, skipna=False).astype('int64')
        )
    df['day_x_month'] = df['day'] * df['month']
    return df

# Add the engineered columns to the training frame first, then the test frame.
train_df, test_df = map(fet_engg, (train_df, test_df))

In [10]:
!pip install h2o

[0m

In [11]:
# import packages
import h2o
from h2o.automl import H2OAutoML

## prepare data
h2o.init()

h2o_train = h2o.H2OFrame(train_df)
h2o_test = h2o.H2OFrame(test_df)

# Convert the target to a factor so AutoML treats the task as classification.
h2o_train['y_bool'] = h2o_train['y_bool'].asfactor()

# Predictors are every column except the target and the 'Unnamed: 0' column.
# That column is the row index that was saved into the CSV (pandas labels a
# header-less column 'Unnamed: N'); it identifies rows and is not a feature.
features = [
    x for x in h2o_train.columns
    if x != 'y_bool' and not x.startswith('Unnamed')
]

# Search budget: at most 10 hours of wall-clock time, with 10-fold cross-validation.
model_h2o = H2OAutoML(max_runtime_secs=10*3600, seed=42, balance_classes=True, nfolds=10)
model_h2o.train(x=features, y='y_bool', training_frame=h2o_train)

# Show the leaderboard in full rather than only its first rows.
lb = model_h2o.leaderboard
lb.head(rows=lb.nrows)

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.16" 2022-07-19; OpenJDK Runtime Environment (build 11.0.16+8-post-Ubuntu-0ubuntu120.04); OpenJDK 64-Bit Server VM (build 11.0.16+8-post-Ubuntu-0ubuntu120.04, mixed mode, sharing)
  Starting server from /opt/conda/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpedwo9l4u
  JVM stdout: /tmp/tmpedwo9l4u/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpedwo9l4u/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.38.0.1
H2O_cluster_version_age:,2 months and 14 days
H2O_cluster_name:,H2O_from_python_unknownUser_r7de4v
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.500 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
GBM_grid_1_AutoML_1_20221204_105320_model_68,0.541658,0.579989,0.301019,0.495788,0.442428,0.195742
StackedEnsemble_BestOfFamily_7_AutoML_1_20221204_105320,0.541247,0.579989,0.300866,0.494686,0.442429,0.195744
GBM_grid_1_AutoML_1_20221204_105320_model_47,0.540377,0.58012,0.298472,0.490247,0.4425,0.195806
XGBoost_lr_search_selection_AutoML_1_20221204_105320_select_grid_model_7,0.54035,0.581615,0.300525,0.490842,0.443153,0.196384
GBM_grid_1_AutoML_1_20221204_105320_model_122,0.539234,0.580074,0.299171,0.486512,0.442481,0.19579
GBM_grid_1_AutoML_1_20221204_105320_model_64,0.538972,0.580391,0.298829,0.497606,0.442604,0.195898
StackedEnsemble_BestOfFamily_6_AutoML_1_20221204_105320,0.538865,0.580273,0.299016,0.49334,0.44256,0.19586
GBM_grid_1_AutoML_1_20221204_105320_model_37,0.538747,0.580744,0.294082,0.493801,0.442774,0.196048
StackedEnsemble_BestOfFamily_4_AutoML_1_20221204_105320,0.538468,0.580262,0.298448,0.49578,0.442556,0.195856
GBM_grid_1_AutoML_1_20221204_105320_model_102,0.538409,0.580336,0.298724,0.498502,0.442581,0.195878


In [12]:
# Score the test frame with the top leaderboard model and keep the 'p1'
# column of the prediction table (presumably the class-1 probability -- confirm
# against the H2O docs).
leader_output = model_h2o.leader.predict(h2o_test).as_data_frame()
preds_h2o = leader_output['p1']

## create submission
sub['y_bool'] = preds_h2o
sub.to_csv('submission_h2o1.csv', index=False)

# Display the predictions that were written.
preds_h2o

gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


0       0.235254
1       0.306534
2       0.241959
3       0.243132
4       0.324635
          ...   
8995    0.241791
8996    0.245858
8997    0.266333
8998    0.237061
8999    0.280445
Name: p1, Length: 9000, dtype: float64