#### 导入模型

In [1]:
import numpy as np
import pandas as pd

#### 直接设置训练集格式模型

In [2]:
titanic_train = pd.read_csv("./titanic/titanic_train.txt", header=None, sep="\t")
titanic_test = pd.read_csv("./titanic/titanic_test.txt", header=None, sep="\t")

In [3]:
titanic_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0,1,0,1,0,0,0,1,0,1,0,0,1,-0.561363,-0.502445
1,1,1,0,0,1,1,0,0,1,0,1,0,0,0.613182,0.786845
2,1,0,0,1,0,0,0,1,1,0,0,0,1,-0.267727,-0.488854
3,1,1,0,0,1,0,0,1,1,0,1,0,0,0.392955,0.42073
4,0,0,0,1,0,0,0,1,0,1,0,0,1,0.392955,-0.486337


In [4]:
titanic_train.columns=['label']+['col_'+str(i) for i in range(1,15)]
titanic_test.columns=['label']+['col_'+str(i) for i in range(1,15)]

In [5]:
titanic_train.head()

Unnamed: 0,label,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,col_11,col_12,col_13,col_14
0,0,1,0,1,0,0,0,1,0,1,0,0,1,-0.561363,-0.502445
1,1,1,0,0,1,1,0,0,1,0,1,0,0,0.613182,0.786845
2,1,0,0,1,0,0,0,1,1,0,0,0,1,-0.267727,-0.488854
3,1,1,0,0,1,0,0,1,1,0,1,0,0,0.392955,0.42073
4,0,0,0,1,0,0,0,1,0,1,0,0,1,0.392955,-0.486337


In [6]:
titanic_train['label'] =titanic_train['label'].astype(int)

#### 将特征列和label 列筛选出

In [7]:
features=['col_'+str(i) for i in range(1,15)]
y=['label']

#### 初始化 h2o

In [8]:
import h2o
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 25.271-b09, mixed mode)
  Starting server from E:\ProgramData\Anaconda3\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\ADMINI~1\AppData\Local\Temp\tmpll_ng162
  JVM stdout: C:\Users\ADMINI~1\AppData\Local\Temp\tmpll_ng162\h2o_Administrator_started_from_python.out
  JVM stderr: C:\Users\ADMINI~1\AppData\Local\Temp\tmpll_ng162\h2o_Administrator_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,12 secs
H2O cluster timezone:,Asia/Shanghai
H2O data parsing timezone:,UTC
H2O cluster version:,3.20.0.2
H2O cluster version age:,"2 years, 7 months and 14 days !!!"
H2O cluster name:,H2O_from_python_Administrator_0c7hcb
H2O cluster total nodes:,1
H2O cluster free memory:,12.44 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


#### 将训练集和测试集转成H2OFrame 格式

In [9]:
train = h2o.H2OFrame(titanic_train)
test = h2o.H2OFrame(titanic_test)

  data = _handle_python_lists(python_obj.as_matrix().tolist(), -1)[1]


Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


#### 筛选label 列

In [10]:
target='label'

#### 因为是二分类，将目标值转成类别--重要

In [11]:
train[target] = train[target].asfactor()

#### gbm模型

In [22]:
from h2o.estimators.gbm import H2OGradientBoostingEstimator
nFolds = 5
sampleRatePerClass = [0.62, 1]

gbm = H2OGradientBoostingEstimator(nfolds = nFolds, fold_assignment = "Modulo",keep_cross_validation_predictions = True,

                                   min_rows = 4, ntrees = 50000, max_depth = 3, learn_rate = 0.01, balance_classes=True,

                                   stopping_metric = 'AUC', stopping_rounds = 100, stopping_tolerance = 1e-4, score_tree_interval = 10, seed = 333)

In [23]:
gbm.train(x = features, y = 'label', training_frame = train)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [24]:
gbm.model_performance().auc()

0.9833706805047764

#### 随机森林

In [25]:
from h2o.estimators.random_forest import H2ORandomForestEstimator

rf = H2ORandomForestEstimator(nfolds = nFolds, fold_assignment = "Modulo",keep_cross_validation_predictions = True,

                                   min_rows = 4, ntrees = 50000, max_depth = 6, balance_classes=True,

                                   stopping_metric = 'AUC', stopping_rounds = 100, stopping_tolerance = 1e-4, score_tree_interval = 10, seed = 333)

rf.train(x = features, y = target, training_frame = train)

rf.model_performance().auc()

drf Model Build progress: |███████████████████████████████████████████████| 100%


0.8800549724155102

#### H2O里面的stacking模型

In [26]:
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator

# metaLearnerParams = {'balance_classes': 'True'}

ensemble = H2OStackedEnsembleEstimator(base_models=[gbm, rf],

                                       metalearner_algorithm = 'glm', # metalearner_params = metaLearnerParams,

                                       seed = 333)

ensemble.train(x = features, y = target, training_frame = train)

ensemble.model_performance().auc()

stackedensemble Model Build progress: |███████████████████████████████████| 100%


0.9592214446255285

#### 预测

In [27]:
finalPrediction = ensemble.predict(test[:-1])

stackedensemble prediction progress: |████████████████████████████████████| 100%




In [28]:
finalPrediction[:]

predict,p0,p1
0,0.884052,0.115948
0,0.756909,0.243091
0,0.85773,0.14227
0,0.889607,0.110393
0,0.611123,0.388877
0,0.813694,0.186306
1,0.190374,0.809626
0,0.901115,0.0988851
1,0.218633,0.781367
0,0.903936,0.096064




In [46]:
# submission = test.concat(finalPrediction,axis=1)[['PassengerId','predict']].as_data_frame(use_pandas=True)

# submission.rename(columns={'predict': 'Survived'}, inplace=True)

# submission.to_csv('submission.csv', index = False)