In [1]:
import numpy as np 
import pandas as pd 
import lightgbm as lgb
import xgboost as xgb
# from catboost import CatBoostRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from scipy import sparse
import warnings
import time
import sys
import os
import re
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns',None)
pd.set_option('max_colwidth',100)

In [2]:
train = pd.read_csv('jinnan_round1_train_20181227.csv', encoding = 'gb18030')
test  = pd.read_csv('jinnan_round1_testA_20181227.csv', encoding = 'gb18030')

In [3]:
for df in [train, test]:
    df.drop(['B3', 'B13', 'A13', 'A18', 'A23'], axis=1, inplace=True)

In [4]:
good_cols = list(train.columns)
# for col in train.columns:
#     rate = train[col].value_counts(normalize=True, dropna=False).values[0]
#     if rate > 0.9:
#         good_cols.remove(col)
#         print(col,rate)

train = train[train['收率']>0.865]
        
train = train[good_cols]
good_cols.remove('收率')
test  = test[good_cols]
ori_train = train

In [5]:
target = train['收率']
del train['收率']
data = pd.concat([train,test],axis=0,ignore_index=True)
data = data.fillna(-1)

In [6]:
def timeTranSecond(t):
    try:
        t,m,s=t.split(":")
    except:
        if t=='1900/1/9 7:00':
            return 7*3600/3600
        elif t=='1900/1/1 2:30':
            return (2*3600+30*60)/3600
        elif t==-1:
            return -1
        else:
            return 0
    
    try:
        tm = (int(t)*3600+int(m)*60+int(s))/3600
    except:
        return (30*60)/3600
    
    return tm
for f in ['A5','A7','A9','A11','A14','A16','A24','A26','B5','B7']:
    data[f] = data[f].apply(timeTranSecond)

def getDuration(se):
    try:
        sh,sm,eh,em=re.findall(r"\d+\.?\d*",se)
    except:
        if se == -1:
            return -1 
        
    try:
        if int(sh)>int(eh):
            tm = (int(eh)*3600+int(em)*60-int(sm)*60-int(sh)*3600)/3600 + 24
        else:
            tm = (int(eh)*3600+int(em)*60-int(sm)*60-int(sh)*3600)/3600
    except:
        if se=='19:-20:05':
            return 1
        elif se=='15:00-1600':
            return 1
    
    return tm

def get_start(se):
    try:
        sh, sm, eh, em = re.findall(r'\d+\.?d*', se)
    except:
        if se == -1:
            return -1
    try:
        tm = (int(eh) * 3600 + int(em) * 60)/3600
    except:
        if se == '19:-20:05':
            return 19
        elif se == '15:00-1600':
            return 15
    return tm
for f in ['A20','A28','B4','B9','B10','B11']:
    data[f+'_diff'] = data.apply(lambda df: getDuration(df[f]), axis=1)
for f in ['A20','A28','B4','B9','B10','B11']:
    data[f] = data.apply(lambda df: get_start(df[f]), axis=1)

In [7]:
data['样本id'] = data['样本id'].apply(lambda x: int(x.split('_')[1]))

categorical_columns = [f for f in data.columns if f not in ['样本id']]
numerical_columns = [f for f in data.columns if f not in categorical_columns]

In [8]:
for f in categorical_columns:
    data[f] = data[f].map(dict(zip(data[f].unique(), range(0, data[f].nunique()))))
train = data[:train.shape[0]]
test  = data[train.shape[0]:]
print(train.shape)
print(test.shape)

(1389, 44)
(150, 44)


In [9]:
train['target'] = target
train['intTarget'] = pd.cut(train['target'], 5, labels=False)
train = pd.get_dummies(train, columns=['intTarget'])
li = ['intTarget_0.0','intTarget_1.0','intTarget_2.0','intTarget_3.0','intTarget_4.0']
mean_columns = []
for f1 in categorical_columns:
    cate_rate = train[f1].value_counts(normalize=True, dropna=False).values[0]
    if cate_rate < 0.90:
        for f2 in li:
            col_name = 'B14_to_'+f1+"_"+f2+'_mean'
            mean_columns.append(col_name)
            order_label = train.groupby([f1])[f2].mean()
            train[col_name] = train['B14'].map(order_label)
            miss_rate = train[col_name].isnull().sum() * 100 / train[col_name].shape[0]
            if miss_rate > 0:
                train = train.drop([col_name], axis=1)
                mean_columns.remove(col_name)
            else:
                test[col_name] = test['B14'].map(order_label)
                
train.drop(li+['target'], axis=1, inplace=True)
print(train.shape)
print(test.shape)

(1389, 149)
(150, 149)


In [10]:
X_train = train[mean_columns+numerical_columns].values
X_test = test[mean_columns+numerical_columns].values

In [11]:
# one hot
enc = OneHotEncoder()
for f in categorical_columns:
    enc.fit(data[f].values.reshape(-1, 1))
    X_train = sparse.hstack((X_train, enc.transform(train[f].values.reshape(-1, 1))), 'csr')
    X_test = sparse.hstack((X_test, enc.transform(test[f].values.reshape(-1, 1))), 'csr')
print(X_train.shape)
print(X_test.shape)

(1389, 1707)
(150, 1707)


In [12]:
y_train = target.values

In [13]:
import h2o
from h2o.automl import H2OAutoML

In [14]:
h2o.init(max_mem_size='16G')

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_191"; OpenJDK Runtime Environment (build 1.8.0_191-8u191-b12-0ubuntu0.16.04.1-b12); OpenJDK 64-Bit Server VM (build 25.191-b12, mixed mode)
  Starting server from /home/wangruipeng/anaconda3/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp1o7o0xn0
  JVM stdout: /tmp/tmp1o7o0xn0/h2o_wangruipeng_started_from_python.out
  JVM stderr: /tmp/tmp1o7o0xn0/h2o_wangruipeng_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,00 secs
H2O cluster timezone:,America/New_York
H2O data parsing timezone:,UTC
H2O cluster version:,3.22.1.1
H2O cluster version age:,13 days
H2O cluster name:,H2O_from_python_wangruipeng_vhnulu
H2O cluster total nodes:,1
H2O cluster free memory:,14.22 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


In [15]:
train = pd.DataFrame(X_train.toarray())
train['target'] = y_train

In [16]:
test = pd.DataFrame(X_test.toarray())

In [17]:
train.to_csv('train.csv', encoding='utf-8', index=None, header=False)
test.to_csv('test.csv', encoding='utf-8', index=None,header=False)

In [18]:
train = h2o.upload_file("train.csv")
test = h2o.upload_file("test.csv")

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [19]:
feature_name = [i for i in train.columns if i not in ['C1708']]
x = feature_name
y = 'C1708'

In [20]:
aml = H2OAutoML(max_models=20, seed=666, max_runtime_secs=12800)
#aml.train(x=X_train, y=y_train)
aml.train(x=x, y=y, training_frame=train)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [21]:
lb = aml.leaderboard
lb.head(rows=lb.nrows)

model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_BestOfFamily_AutoML_20190111_053558,0.000128704,0.0113448,0.000128704,0.0082868,0.00589282
StackedEnsemble_AllModels_AutoML_20190111_053558,0.000129099,0.0113622,0.000129099,0.00827374,0.00590145
XGBoost_3_AutoML_20190111_053558,0.000129562,0.0113825,0.000129562,0.00828565,0.00591162
XGBoost_2_AutoML_20190111_053558,0.000131582,0.0114709,0.000131582,0.00828806,0.00595647
GBM_3_AutoML_20190111_053558,0.000133353,0.0115479,0.000133353,0.00845132,0.00599837
GBM_2_AutoML_20190111_053558,0.000134199,0.0115844,0.000134199,0.00849163,0.00601674
GBM_4_AutoML_20190111_053558,0.000134595,0.0116015,0.000134595,0.0084769,0.00602591
XGBoost_1_AutoML_20190111_053558,0.000135969,0.0116606,0.000135969,0.00852281,0.00605163
GBM_grid_1_AutoML_20190111_053558_model_1,0.000136228,0.0116717,0.000136228,0.00854717,0.00606143
GBM_1_AutoML_20190111_053558,0.000141004,0.0118745,0.000141004,0.00878137,0.00616315




In [22]:
oof_h2o = aml.predict(train[:,:-1]).as_data_frame().values.flatten()

stackedensemble prediction progress: |████████████████████████████████████| 100%


In [23]:
pd.Series(oof_h2o).to_csv('oof_h2o.csv', index=False, header=None)

In [24]:
automl_predictions = aml.predict(test).as_data_frame().values.flatten()

stackedensemble prediction progress: |████████████████████████████████████| 100%


In [25]:
pd.Series(automl_predictions).to_csv('predictions_h2o.csv', index=False, header=None)