In [1]:
import numpy as np
import scipy as sp

from sklearn import cross_validation
from sklearn.cross_validation import StratifiedKFold as KFold
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier as RF
import xgboost as xgb

data_path = '/home/ubuntu/fs/data/dsb17/'
results_path = data_path+'results/'



## XGBoost

In [2]:
x = np.load(results_path+'dataX.npy')
y = np.load(results_path+'dataY.npy')

trn_x, val_x, trn_y, val_y = cross_validation.train_test_split(x, y, random_state=42, stratify=y,
                                                               test_size=0.20)

clf = xgb.XGBRegressor(max_depth=10,
                           n_estimators=5000,
                           min_child_weight=9,
                           learning_rate=0.01,
                           nthread=8,
                           subsample=0.80,
                           colsample_bytree=0.80,
                           seed=4242)

clf.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], eval_metric='logloss', early_stopping_rounds=50)

[0]	validation_0-logloss:0.69007
Will train until validation_0-logloss hasn't improved in 50 rounds.
[1]	validation_0-logloss:0.686939
[2]	validation_0-logloss:0.684088
[3]	validation_0-logloss:0.681628
[4]	validation_0-logloss:0.678959
[5]	validation_0-logloss:0.676043
[6]	validation_0-logloss:0.673593
[7]	validation_0-logloss:0.670722
[8]	validation_0-logloss:0.668342
[9]	validation_0-logloss:0.665847
[10]	validation_0-logloss:0.663576
[11]	validation_0-logloss:0.660939
[12]	validation_0-logloss:0.658739
[13]	validation_0-logloss:0.656201
[14]	validation_0-logloss:0.654033
[15]	validation_0-logloss:0.652053
[16]	validation_0-logloss:0.649601
[17]	validation_0-logloss:0.647549
[18]	validation_0-logloss:0.645191
[19]	validation_0-logloss:0.643257
[20]	validation_0-logloss:0.640814
[21]	validation_0-logloss:0.638899
[22]	validation_0-logloss:0.637047
[23]	validation_0-logloss:0.635504
[24]	validation_0-logloss:0.633773
[25]	validation_0-logloss:0.631683
[26]	validation_0-logloss:0.62985

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.01, max_delta_step=0, max_depth=10,
       min_child_weight=9, missing=None, n_estimators=5000, nthread=8,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=4242, silent=True, subsample=0.8)

In [3]:
from sklearn.metrics import log_loss
val_y_pred = clf.predict(val_x)

val_y_pred[val_y_pred > 0.85] = 0.85
val_y_pred[val_y_pred < 0.15] = 0.15

print("logloss", log_loss(val_y, val_y_pred))

('logloss', 0.53055496434015892)


## Create Submission

In [4]:
X_test = np.load(results_path+'testX.npy')
X_ids = np.load(results_path+'testId.npy')

y_pred = clf.predict(X_test)

In [5]:
subm = np.stack([X_ids, y_pred], axis=1)
subm_file_name = results_path+'subm5.csv'
np.savetxt(subm_file_name, subm, fmt='%s,%.5f', header='id,cancer', comments='')
print('Saved predictions in {}'.format(subm_file_name))

Saved predictions in /home/ubuntu/fs/data/dsb17/results/subm4.csv


In [6]:
from IPython.display import FileLink
FileLink('../../data/dsb17/results/subm4.csv')