In [1]:
import teleloggingbot
import pprint
from tqdm import tqdm
import random
import numpy as np
import pandas as pd
from scipy.stats import norm, pearsonr, gmean
import scipy as sp

from sklearn.metrics import log_loss, make_scorer
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.neural_network import MLPClassifier

import xgboost
import lightgbm as lgb

from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import Adam, SGD, RMSprop, Nadam
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import LeakyReLU, PReLU
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

import warnings
# Suppress every warning for the remainder of the session.
warnings.simplefilter("ignore")

# Let pandas show up to 100 rows and 80 columns before truncating its output.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 80)

Using TensorFlow backend.


In [7]:
def getKerasNN(input_dim):
    """Build and compile a one-hidden-layer binary classifier.

    Architecture: Dense(64, he_uniform) -> ReLU -> Dropout(0.15) ->
    Dense(1, sigmoid, he_uniform). The model is compiled with binary
    cross-entropy loss and the RMSprop optimizer; no extra metrics are tracked.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(64, init='he_uniform', input_dim=input_dim))
    model.add(Activation('relu'))
    # No second activation is needed after the dropout: its input is already
    # non-negative, so another ReLU here would be a no-op.
    model.add(Dropout(0.15))
    model.add(Dense(1, activation='sigmoid', init='he_uniform'))

    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=[])
    return model

In [3]:
# Feature tables for the train and test sets (no column is used as the index).
dfx_train, dfx_test = (
    pd.read_csv(csv_path, index_col=None)
    for csv_path in ('./x_train_my.csv', './x_test_my.csv')
)
# The target file is read without a header row.
dfy_train = pd.read_csv('./y_train.csv', header=None)

In [4]:
%%time
# Cross-validate XGBoost with early stopping and record the resulting round count.
xgb_matrix = xgboost.DMatrix(data=dfx_train, label=dfy_train)
xgb_param = dict(
    colsample_bylevel=0.85,
    colsample_bytree=0.55,
    gamma=3.3,
    learning_rate=0.008,
    max_delta_step=8.9,
    max_depth=4,
    min_child_weight=7.0,
    n_estimators=4000,
    nthread=8,
    objective='binary:logistic',
    reg_lambda=9.95,
    scale_pos_weight=1.0,
    silent=1,
    subsample=0.55,
)
cvresult = xgboost.cv(
    params=xgb_param,
    dtrain=xgb_matrix,
    num_boost_round=xgb_param['n_estimators'] + 1,
    nfold=5,
    stratified=True,
    metrics='logloss',
    early_stopping_rounds=50,
    verbose_eval=50,
    seed=42,
)
print('Done.')

# Store the number of rounds present in the CV result as the new n_estimators.
xgb_param['n_estimators'] = len(cvresult)
msg = 'Done XGBOOST CV! logloss: {}\n{}'.format(
    cvresult['test-logloss-mean'].iloc[-1],
    pprint.pformat(xgb_param, indent=4),
)
# The teleloggingbot notification is disabled; the message is only printed.
# teleloggingbot.sendMsg(msg)
print(msg)

[0]	train-logloss:0.689114+1.12677e-05	test-logloss:0.689149+2.2069e-05
[50]	train-logloss:0.546864+0.000458823	test-logloss:0.548208+0.00100929
[100]	train-logloss:0.472392+0.000623091	test-logloss:0.474891+0.00156515
[150]	train-logloss:0.430901+0.000710238	test-logloss:0.434403+0.00201263
[200]	train-logloss:0.407021+0.000796862	test-logloss:0.41143+0.00235295
[250]	train-logloss:0.392986+0.000885477	test-logloss:0.398233+0.00262921
[300]	train-logloss:0.384629+0.000950277	test-logloss:0.390586+0.00288776
[350]	train-logloss:0.379535+0.000970087	test-logloss:0.386182+0.00310733
[400]	train-logloss:0.37637+0.000981235	test-logloss:0.383658+0.00323715
[450]	train-logloss:0.374254+0.000995089	test-logloss:0.382161+0.00344236
[500]	train-logloss:0.372802+0.00101555	test-logloss:0.38136+0.00357555
[550]	train-logloss:0.371694+0.00104418	test-logloss:0.380841+0.00367601
[600]	train-logloss:0.370754+0.00106128	test-logloss:0.380508+0.00371272
[650]	train-logloss:0.369928+0.00104402	test-lo

In [5]:
# Fit the min-max scaler on train and test rows together, then apply the same
# fitted scaler to each set.
scaler = MinMaxScaler().fit(pd.concat((dfx_train, dfx_test)))
xtrain, xtest = (scaler.transform(frame) for frame in (dfx_train, dfx_test))

In [9]:
# 5-fold stratified cross-validation of the Keras network built by getKerasNN,
# trained on the `xtrain` feature matrix.
kf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
scores = []  # per-fold log loss
# Out-of-fold predictions, one slot per training row.
error = np.zeros((len(dfy_train)), dtype=np.float32)

batch_size = 512
# Stop a fit once val_loss has gone 5 epochs without improving.
early_stop = EarlyStopping(
        monitor='val_loss',
        min_delta=0,
        patience=5,
        verbose=0,
        mode='auto')

for i, (train_index, test_index) in enumerate(kf.split(xtrain, dfy_train.values.ravel())):
    X_train, X_test = xtrain[train_index], xtrain[test_index]
    y_train, y_test = dfy_train.values[train_index], dfy_train.values[test_index]
    model = getKerasNN(dfx_train.shape[1])
    # NOTE(review): the held-out fold is also the early-stopping validation set,
    # so the fold scores below are likely optimistic -- confirm this is acceptable.
    model.fit(X_train, y_train,
              batch_size=batch_size,
              nb_epoch=500,
              verbose=0,
              validation_split=0,
              validation_data=(X_test, y_test),
              callbacks=[early_stop])
    preds = model.predict_proba(X_test, verbose=0)
    # getKerasNN ends in a single output unit, so `preds` is expected to be a
    # column of shape (n, 1). Flatten it before writing into the 1-D buffer:
    # current NumPy raises a shape-mismatch ValueError for an (n, 1) value
    # assigned to an (n,) fancy-indexed target.
    error[test_index] = preds.ravel()
    scores.append(log_loss(y_test, preds))
    print(i+1, scores[-1])
print("MEAN: {:.5} STD: {:.5}".format(np.mean(scores), np.std(scores)))
print('Logloss on whole data', log_loss(dfy_train.values, error.reshape(-1,1)))

1 0.378271050046
2 0.389505603458
3 0.38261223804
4 0.381939568285
5 0.381682689352
MEAN: 0.3828 STD: 0.0036742
Logloss on whole data 0.382802129042


In [10]:
# LightGBM training set built from the raw feature values and the flattened target.
lgb_matrix = lgb.Dataset(
    data=dfx_train.values,
    label=dfy_train.values.ravel(),
)

In [11]:
# Cross-validate LightGBM with early stopping and record the resulting round count.
lgb_param = dict(
    bagging_fraction=0.55,
    bagging_freq=1,
    boosting='gbdt',
    feature_fraction=0.75,
    lambda_l2=6.05,
    learning_rate=0.012,
    max_bin=31,
    max_depth=6,
    metric=('binary_logloss',),
    num_iterations=1000,
    num_leaves=15,
    num_threads=8,
    objective='binary',
    verbose=0,
)
cvresult = lgb.cv(
    params=lgb_param,
    train_set=lgb_matrix,
    num_boost_round=lgb_param['num_iterations'] + 1,
    nfold=4,
    stratified=True,
    early_stopping_rounds=50,
    verbose_eval=50,
    show_stdv=False,
    seed=42,
)

# Store the length of the CV log-loss curve as the new num_iterations.
lgb_param['num_iterations'] = len(cvresult['binary_logloss-mean'])
msg = 'Done lightgbm CV! logloss: {} +- {}\n{}'.format(
    cvresult['binary_logloss-mean'][-1],
    round(cvresult['binary_logloss-stdv'][-1], 8),
    pprint.pformat(lgb_param, indent=4),
)
# The teleloggingbot notification is disabled; the message is only printed.
# teleloggingbot.sendMsg(msg)
print(msg)

[50]	cv_agg's binary_logloss: 0.50567
[100]	cv_agg's binary_logloss: 0.433182
[150]	cv_agg's binary_logloss: 0.402457
[200]	cv_agg's binary_logloss: 0.38925
[250]	cv_agg's binary_logloss: 0.383523
[300]	cv_agg's binary_logloss: 0.381175
[350]	cv_agg's binary_logloss: 0.380174
[400]	cv_agg's binary_logloss: 0.379842
[450]	cv_agg's binary_logloss: 0.379738
[500]	cv_agg's binary_logloss: 0.379737
Done lightgbm CV! logloss: 0.37966813796021326 +- 0.00208693
{   'bagging_fraction': 0.55,
    'bagging_freq': 1,
    'boosting': 'gbdt',
    'feature_fraction': 0.75,
    'lambda_l2': 6.05,
    'learning_rate': 0.012,
    'max_bin': 255,
    'max_depth': 6,
    'metric': ('binary_logloss',),
    'num_iterations': 484,
    'num_leaves': 15,
    'num_threads': 8,
    'objective': 'binary',
    'verbose': 0}


In [None]:
%%time
# Train 3000 LightGBM models that differ in seed, depth and round count, and
# collect each model's test-set predictions.
lgb_preds = []
for i in tqdm(range(3000)):
    s = i + 1
    np.random.seed(s)
    random.seed(s)
    # Start from the CV-tuned parameters and perturb them for this run:
    # depth is drawn from {5, 6, 7}, the round count is shifted by up to +-50.
    random_param = dict(
        lgb_param,
        seed=s + 1,
        max_depth=np.random.choice([5, 6, 7], p=[0.6, 0.2, 0.2]),
        num_iterations=lgb_param['num_iterations'] + random.randint(-50, 50),
    )
    lgb_clf = lgb.train(
        random_param,
        lgb_matrix,
        num_boost_round=random_param['num_iterations'],
    )
    lgb_preds.append(lgb_clf.predict(dfx_test.values))

# Notify through teleloggingbot once all models are trained.
teleloggingbot.sendMsg('DONE 3000 LGB!')

 11%|█         | 336/3000 [09:55<1:15:53,  1.71s/it]

In [None]:
%%time
# Train 3000 XGBoost classifiers that differ in seed, round count and depth, and
# collect each model's positive-class probabilities on the test set.
xgb_preds = []
for i in tqdm(range(3000)):
    s = i + 1
    np.random.seed(s)
    random.seed(s)
    # Start from the CV-tuned parameters and perturb them for this run:
    # the round count is shifted by up to +-50, depth is drawn from {4, 5, 6}.
    random_param = dict(
        xgb_param,
        seed=s,
        n_estimators=xgb_param['n_estimators'] + random.randint(-50, 50),
        max_depth=np.random.choice([4, 5, 6], p=[0.6, 0.2, 0.2]),
    )
    gbm = xgboost.XGBClassifier(**random_param)
    gbm.fit(dfx_train, dfy_train.values.ravel())
    xgb_preds.append(gbm.predict_proba(dfx_test)[:, 1])
# Notify through teleloggingbot once all models are trained.
teleloggingbot.sendMsg('DONE 3000 XGB!')

In [32]:
%%time
# Train 3000 scikit-learn MLPs that differ in random_state and iteration budget,
# and collect each model's positive-class probabilities on the test set.
mlp_preds = []
for i in tqdm(range(3000)):
    s = i+1
    np.random.seed(s)
    random.seed(s)
    # NOTE(review): this fits on the raw `dfx_train`, not the MinMax-scaled
    # `xtrain` used for the Keras network -- confirm that is intended.
    clf = MLPClassifier(hidden_layer_sizes=(64,64),
                        batch_size=64,  # the missing comma here made the cell a SyntaxError
                        learning_rate_init=0.01,
                        max_iter=200+random.randint(-50,50),
                        random_state=s)
    clf.fit(dfx_train, dfy_train.values.ravel())
    mlp_preds.append(clf.predict_proba(dfx_test)[:,1])
# The message now matches the actual number of models trained (3000).
teleloggingbot.sendMsg('DONE 3000 MLP!')

100%|██████████| 3000/3000 [3:18:14<00:00,  3.79s/it]  


CPU times: user 4h 40min 12s, sys: 4h 1min 58s, total: 8h 42min 11s
Wall time: 3h 18min 16s


In [34]:
# Blend: take the geometric mean over the runs of each model family, then the
# geometric mean of the three family-level predictions, and write it to ans.csv.
np.savetxt(
    "ans.csv",
    gmean([gmean(run_preds) for run_preds in (lgb_preds, xgb_preds, mlp_preds)]),
    fmt='%10.16f',
)