In [19]:
import os
import h2o
import pandas as pd
from h2o.automl import H2OAutoML
from sklearn.metrics import mean_squared_error

# Start a local H2O cluster, or attach to one that is already running on the
# default address; the keyword asks for a 32 GB memory ceiling.
h2o.init(max_mem_size_GB=32)

# Dataset locations, relative to the notebook's working directory.
# NOTE(review): the get_data() loader defined below reads from
# '../datasets/jinnan' rather than from these constants -- confirm which
# location is the intended one.
BASE_PATH = './datasets'
ETLDATA_PATH = os.path.join(BASE_PATH, 'jinnan')


def get_data(data_dir='../datasets/jinnan'):
    """Upload the training features, test features and submission template to H2O.

    NOTE: a later cell in this notebook defines another ``get_data(name)``
    that shadows this one; re-run this cell before calling ``auto_ml()``
    again in the same kernel session.

    Args:
        data_dir: Directory containing ``train_feature.csv``,
            ``test_feature.csv`` and ``test_submit.csv``. Defaults to the
            location this notebook has always read from.

    Returns:
        Tuple ``(train_data, test_data, test_data_sample)`` holding the three
        frames returned by ``h2o.upload_file``, in that order.
    """
    file_names = ('train_feature.csv', 'test_feature.csv', 'test_submit.csv')

    # Upload in a fixed order so the tuple layout callers unpack never changes.
    train_data, test_data, test_data_sample = [
        h2o.upload_file(os.path.join(data_dir, file_name))
        for file_name in file_names
    ]

    return train_data, test_data, test_data_sample


def auto_ml(target='C1261', max_models=15, seed=2019, max_runtime_secs=7200,
            output_path='./submittion_automl.csv'):
    """Train H2O AutoML on the training frame and write test-set predictions.

    Every column of the training frame except ``target`` is used as a
    feature. After training, the leaderboard head and the in-sample
    (training-data) MSE are printed, then the test frame is scored and the
    predictions are written next to the ids taken from the submission template.

    Args:
        target: Name of the response column in the training frame.
        max_models: Passed to ``H2OAutoML(max_models=...)``.
        seed: Passed to ``H2OAutoML(seed=...)``.
        max_runtime_secs: Passed to ``H2OAutoML(max_runtime_secs=...)``.
        output_path: Destination of the headerless, index-free predictions CSV.

    Returns:
        None. Results are printed and written to ``output_path``.
    """
    train_data, test_data, test_data_sample = get_data()

    features_columns = [column for column in train_data.columns if column != target]

    # Column 'C1' of the submission template supplies the ids written to the
    # output file (presumably the sample ids -- TODO confirm against the CSV).
    test_sample_ids = test_data_sample['C1'].as_data_frame().values.flatten()
    y_labels = train_data[target].as_data_frame().values.flatten()

    aml = H2OAutoML(max_models=max_models, seed=seed, max_runtime_secs=max_runtime_secs)
    aml.train(x=features_columns, y=target, training_frame=train_data)

    print(aml.leaderboard.head())

    # In-sample error: the same rows that were used for training are scored
    # here, so this number is not an estimate of generalisation error.
    train_predictions = aml.predict(train_data[features_columns]).as_data_frame().values.flatten()
    mse_error = mean_squared_error(y_labels, train_predictions)
    print(f'train mse error: {mse_error}')

    automl_predictions = aml.predict(test_data[features_columns]).as_data_frame().values.flatten()

    # NOTE(review): the column order on disk follows the DataFrame's column
    # order, which for a dict input depends on the pandas version -- confirm
    # it against whatever loads this file afterwards.
    df = pd.DataFrame({'sample_id': test_sample_ids,
                       'automl_rate': automl_predictions})
    # `header` is documented as bool (or list of str); False states the intent
    # explicitly instead of relying on None being falsy.
    df.to_csv(output_path, index=False, header=False)

# Run the AutoML pipeline: trains, prints the leaderboard and the training MSE,
# and writes the AutoML predictions CSV.
auto_ml()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,8 hours 26 mins
H2O cluster timezone:,Asia/Shanghai
H2O data parsing timezone:,UTC
H2O cluster version:,3.22.1.1
H2O cluster version age:,22 days
H2O cluster name:,H2O_from_python_zm_5vtixz
H2O cluster total nodes:,1
H2O cluster free memory:,31.96 Gb
H2O cluster total cores:,16
H2O cluster allowed cores:,16


Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
AutoML progress: |████████████████████████████████████████████████████████| 100%


model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_AllModels_AutoML_20190119_210402,0.000123126,0.0110962,0.000123126,0.008209,0.00575758
StackedEnsemble_BestOfFamily_AutoML_20190119_210402,0.00012444,0.0111553,0.00012444,0.00825649,0.00578819
GBM_2_AutoML_20190119_210402,0.000125918,0.0112213,0.000125918,0.00829388,0.00582269
GBM_grid_1_AutoML_20190119_210402_model_3,0.000125969,0.0112236,0.000125969,0.00834105,0.00582363
GBM_3_AutoML_20190119_210402,0.000126391,0.0112424,0.000126391,0.00832262,0.00583268
GBM_4_AutoML_20190119_210402,0.000126753,0.0112585,0.000126753,0.00832732,0.00584088
GBM_1_AutoML_20190119_210402,0.000132827,0.011525,0.000132827,0.00853395,0.0059823
XRT_1_AutoML_20190119_210402,0.000139521,0.0118119,0.000139521,0.00860369,0.00613015
DRF_1_AutoML_20190119_210402,0.000140455,0.0118514,0.000140455,0.00867493,0.00615187
GBM_grid_1_AutoML_20190119_210402_model_2,0.000155885,0.0124854,0.000155885,0.0095073,0.00648334



stackedensemble prediction progress: |████████████████████████████████████| 100%
train mse error: 5.14655512502549e-05
stackedensemble prediction progress: |████████████████████████████████████| 100%


In [20]:
def get_data(name):
    """Load a headerless two-column submission CSV from the working directory.

    Args:
        name: Either ``'automl'`` or ``'tree'``; selects
            ``submittion_<name>.csv``.

    Returns:
        A DataFrame whose first CSV column is labelled ``'sample_id'`` and
        whose second is labelled ``'<name>_rate'``. The labels are assigned
        purely by position.

    Raises:
        AssertionError: If ``name`` is not one of the two supported values.
    """
    assert name in ['automl', 'tree']

    data_name = f'submittion_{name}.csv'
    columns = ['sample_id', f'{name}_rate']
    # header=None is the supported way to say "the file has no header row";
    # the former header=-1 is rejected with a ValueError by current pandas.
    df = pd.read_csv(data_name, header=None, names=columns)

    return df


def blending_main(automl_rate=0.6):
    """Blend the AutoML and tree submissions into ``submittion.csv``.

    Both inputs are loaded with ``get_data``, joined on the sample id (inner
    join), and combined as
    ``automl * automl_rate + tree * (1 - automl_rate)``. The blended value is
    rounded to 3 decimals and written as ``sample_id,rate`` rows without
    header or index.

    Args:
        automl_rate: Weight given to the AutoML prediction; the tree
            prediction receives ``1 - automl_rate``.

    Raises:
        ValueError: If the two submissions share no sample id, which would
            otherwise produce an empty submission file.
    """
    tree_rate = 1 - automl_rate

    automl_sub = get_data(name='automl')
    tree_sub = get_data(name='tree')

    # get_data labels the first CSV column 'sample_id', but this function has
    # always treated the first column of the AutoML file as the rate (it
    # swapped the two labels after loading). Which layout is actually on disk
    # depends on how the file was written, so pick the column whose values
    # really match the tree file's ids; on a tie keep the original swap.
    hits_as_labelled = automl_sub['sample_id'].isin(tree_sub['sample_id']).sum()
    hits_if_swapped = automl_sub['automl_rate'].isin(tree_sub['sample_id']).sum()
    if hits_as_labelled > hits_if_swapped:
        automl_sub = automl_sub.rename(columns={'automl_rate': 'automl_rate_1'})
    else:
        automl_sub = automl_sub.rename(
            columns={'sample_id': 'automl_rate_1', 'automl_rate': 'sample_id'})

    sub = pd.merge(automl_sub, tree_sub, on='sample_id')
    if sub.empty:
        raise ValueError('automl and tree submissions have no sample_id in common')

    blended = sub['automl_rate_1'] * automl_rate + sub['tree_rate'] * tree_rate
    sub['rate'] = blended.apply(lambda x: round(x, 3))

    # Select the output columns explicitly so the file layout does not depend
    # on the column order the merge happened to produce.
    sub[['sample_id', 'rate']].to_csv('submittion.csv', header=False, index=False)

# Write the blended submission using the default AutoML weight (0.6).
blending_main()