In [60]:
import gc
import datetime
import numpy as np
import pandas as pd
import tensorflow as tf
from xgboost import XGBClassifier
import sklearn.ensemble
from sklearn.metrics import matthews_corrcoef, roc_auc_score
from sklearn.cross_validation import cross_val_score, StratifiedKFold

# 1. Selecting important features

In [92]:

date_chunks = pd.read_csv("./train_date.csv", index_col=0, chunksize=10000, dtype=np.float32)
num_chunks = pd.read_csv("./train_numeric.csv", index_col=0,
                         usecols=list(range(969)), chunksize=10000, dtype=np.float32)

In [93]:
X = pd.concat([pd.concat([dchunk, nchunk], axis=1).sample(frac=0.0.5) for dchunk, nchunk in zip(date_chunks, num_chunks)])

In [94]:
y_all = pd.read_csv("./train_numeric.csv", index_col=0, usecols=[0,969], dtype=np.float32)
y = y_all.loc[X.index].values.ravel()

In [95]:
np.unique(y, return_counts=True)

(array([ 0.,  1.], dtype=float32), array([235391,   1358]))

In [96]:
clf = XGBClassifier(base_score=0.005)
clf.fit(X.values, y)

XGBClassifier(base_score=0.005, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.01, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [97]:
important_indices = np.where(clf.feature_importances_>0.007)[0]

In [98]:
important_indices

array([ 393,  440,  805,  883,  939, 1018, 1019, 1029, 1042, 1050, 1056,
       1188, 1271, 1392, 1497, 1501, 1512, 1516, 1548, 1549, 1550, 1847,
       1883, 1893, 1897, 1900, 1907, 1914, 1944, 1955, 1974, 1985, 1992,
       1994, 2004, 2006, 2007, 2010, 2017, 2040])

In [99]:
important_indices.shape

(40,)

In [100]:
# Select cols
n_date_features = 1156

date_cols = np.concatenate([[0], important_indices[important_indices < n_date_features] + 1])
numeric_cols = np.concatenate([[0], important_indices[important_indices >= n_date_features] + 1 - n_date_features])

In [101]:
total_date_features = len(important_indices[important_indices < n_date_features])

In [103]:


X_date = pd.read_csv("./train_date.csv", index_col=0, dtype=np.float32, usecols=date_cols)
# X_date.fillna(0, inplace=True)
X_num = pd.read_csv("./train_numeric.csv", index_col=0, dtype=np.float32, usecols=numeric_cols)
# X_num.fillna(0, inplace=True)
X = pd.concat([X_date, X_num ], axis=1)

del(X_date)
del(X_num)
gc.collect()

1272

In [83]:
y = pd.read_csv("./train_numeric.csv", index_col=0, dtype=np.float32, usecols=[0,969]).values.ravel()

In [84]:
# Normalize date by columns
for i, c in enumerate(X.columns):
    if i + 1 > total_date_features:
        break
    X[c] = X[c]/X[c].max()

In [104]:
X_date_test = pd.read_csv("./test_date.csv", index_col=0, dtype=np.float32, usecols=date_cols)
# X_date_test.fillna(0, inplace=True)
X_num_test = pd.read_csv("./test_numeric.csv", index_col=0, dtype=np.float32, usecols=numeric_cols)
# X_num_test.fillna(0, inplace=True)

X_test = pd.concat([X_date_test, X_num_test ], axis=1)
    
del(X_date_test)
del(X_num_test)
gc.collect()

21

In [86]:
# Normalize date by columns
for i, c in enumerate(X_test.columns):
    if i + 1 > total_date_features:
        break
    X_test[c] = X_test[c]/X[c].max()

# 2. NN

In [18]:
def to_one_hot(y, nlabels=None):
    if nlabels == None:
        nlabels = np.unique(y).__len__()
    return (np.arange(nlabels) == y[:,None]).astype(float)

In [19]:
def wb(wshape=[None], bshape=[None], device='/cpu:0'):
    with tf.device(device):
        w = tf.get_variable("w", wshape, initializer=tf.truncated_normal_initializer(stddev=0.1))
        b = tf.get_variable('b', bshape, initializer=tf.constant_initializer(0.0))
    return w, b

In [20]:
def accuracy(predictions, labels):
    return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])

In [21]:
y_oh = to_one_hot(y)

In [22]:
pos_idx = np.where(y==1)[0]

In [70]:
batch_size = 32
l2_reg_norm = 5e-5
features = X.shape[1]
train_size = X.shape[0]

widest = 512

tf.reset_default_graph()
graph = tf.Graph()
with graph.as_default():
    X_tf = tf.placeholder(tf.float32, shape=(batch_size, features))
    y_tf = tf.placeholder(tf.float32, shape=(batch_size,2))

    with tf.variable_scope("Layer1"):
            layer1_weights, layer1_biases = wb([features, widest], [widest])
    with tf.variable_scope("Layer2"):
            layer2_weights, layer2_biases = wb([widest, widest//4], [widest//4])
    with tf.variable_scope("Layer3"):
            layer3_weights, layer3_biases = wb([widest//4, widest//16], [widest//16])
    with tf.variable_scope("Layer4"):
            layer4_weights, layer4_biases = wb([widest//16, 2], [2])

    def model(data, train=True):
        print("data", data.get_shape())
        
        layer1 = tf.nn.tanh(tf.matmul(data, layer1_weights) + layer1_biases)
        print("layer1", layer1.get_shape())
        if train:
            tf.nn.dropout(layer1, 0.5)
            
            
        layer2 = tf.nn.tanh(tf.matmul(layer1, layer2_weights) + layer2_biases)
        print("layer2", layer2.get_shape())
        if train:
            tf.nn.dropout(layer2, 0.5)
            
        layer3 = tf.nn.tanh(tf.matmul(layer2, layer3_weights) + layer3_biases)
        print("layer3", layer3.get_shape())
        if train:
            tf.nn.dropout(layer3, 0.5)
        
        
        layer4 = tf.nn.tanh(tf.matmul(layer3, layer4_weights) + layer4_biases)
        print("layer4", layer4.get_shape())

        return layer4

    logits = model(X_tf)

    loss_data = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, y_tf))
    regularizers = (tf.nn.l2_loss(layer1_weights) + tf.nn.l2_loss(layer1_biases) +
                    tf.nn.l2_loss(layer2_weights) + tf.nn.l2_loss(layer2_biases) +
                    tf.nn.l2_loss(layer3_weights) + tf.nn.l2_loss(layer3_biases) +
                    tf.nn.l2_loss(layer4_weights) + tf.nn.l2_loss(layer4_biases))
    loss_l2 = l2_reg_norm * regularizers
    loss = loss_data + loss_l2

    # Optimizer.
    global_step = tf.Variable(0, trainable=False)
    learn_rate  = tf.train.exponential_decay(.0001, global_step*batch_size, train_size//2, 0.5, staircase=True)
    optimizer = tf.train.AdamOptimizer(learn_rate).minimize(loss, global_step=global_step)

    train_prediction = tf.nn.softmax(logits)


data (32, 88)
layer1 (32, 512)
layer2 (32, 128)
layer3 (32, 32)
layer4 (32, 2)


In [66]:
pos_size = 4
batch_size_part = batch_size-pos_size
num_steps = train_size//batch_size_part * 4

print(num_steps)
with tf.Session(graph=graph) as sess:
    init_op = tf.initialize_all_variables()
    saver = tf.train.Saver()
    init_op.run()
    print("Initialized valiables")
    t = datetime.datetime.now()
    for i in range(num_steps):
        offset = (i*batch_size_part) % (X.shape[0] - batch_size_part)
        X_ = X.values[offset:offset+batch_size_part]
        y_ = y_oh[offset:offset+batch_size_part]
        
        offset_pos = (i*pos_size) % (len(pos_idx) - pos_size)
        pos_idx_ = pos_idx[offset_pos:offset_pos+pos_size]
        y__ = to_one_hot(np.array([1]*pos_size), 2)
        X__ = X.iloc[pos_idx_]
        
        X_ = np.concatenate((X_, X__), axis=0)
        y_ = np.concatenate((y_, y__), axis=0)
                                   
        feed_dict = {X_tf : X_, y_tf : y_}
        _, l, ld, pred = sess.run([optimizer, loss, loss_data, train_prediction], feed_dict=feed_dict)
        if (i < 100 and i%10 == 0) or i%10000 == 0:
            
            print(i, l, ld, matthews_corrcoef(np.argmax(y_, 1), np.argmax(pred, 1) ), accuracy(pred, y_), datetime.datetime.now() - t)
            t = datetime.datetime.now()
        if i>0 and i%30000 == 0:
            print(pred)
            save_path = saver.save(sess, "bosch_sel_{}.ckpt".format(i))
    save_path = saver.save(sess, "bosch_sel_end.ckpt")

394580
Initialized valiables
0 0.708609 0.685225 0.0 62.5 0:00:00.028392
10 0.684355 0.661016 -0.1490711985 68.75 0:00:00.032567
20 0.664881 0.641567 0.0 75.0 0:00:00.030993
30 0.679973 0.656679 0.0 75.0 0:00:00.030733
40 0.620125 0.596849 0.0 75.0 0:00:00.029296
50 0.644952 0.621692 0.0 75.0 0:00:00.031232
60 0.615031 0.591785 0.0 75.0 0:00:00.037857
70 0.62802 0.60479 0.0 75.0 0:00:00.038687
80 0.584962 0.561746 0.0 75.0 0:00:00.030472
90 0.614844 0.591645 0.0 75.0 0:00:00.033946
10000 0.558355 0.539945 0.0 75.0 0:00:36.046797
20000 0.518118 0.500101 0.4472135955 81.25 0:00:43.461390
30000 0.284131 0.265621 0.654653670708 87.5 0:00:42.772601
[[ 0.94834739  0.05165254]
 [ 0.94661367  0.0533864 ]
 [ 0.87933207  0.12066793]
 [ 0.88647085  0.1135292 ]
 [ 0.76154464  0.23845528]
 [ 0.92539084  0.07460915]
 [ 0.82630634  0.17369373]
 [ 0.94885856  0.0511414 ]
 [ 0.90955901  0.09044105]
 [ 0.82519352  0.17480646]
 [ 0.84306574  0.15693428]
 [ 0.94136161  0.05863838]
 [ 0.63443494  0.365565 

In [24]:
X_test.shape

(1183748, 50)

In [67]:

ans = np.array([])
with tf.Session(graph = graph) as sess:
    init_op = tf.initialize_all_variables()
    saver = tf.train.Saver()
    init_op.run()
    # Restore variables from disk.
    saver.restore(sess, "bosch_sel_end.ckpt")
    with tf.device("/cpu:0"):
        X_tf_test = tf.placeholder(tf.float32, shape=(None, features))
        test_prediction = tf.nn.softmax(model(X_tf_test, train=False))
    
    ans = sess.run(test_prediction, feed_dict = {X_tf_test:X_test.values})
    

data (?, 88)
layer1 (?, 256)
layer2 (?, 256)
layer3 (?, 128)
layer4 (?, 2)


In [68]:
ans = np.argmax(ans, axis=1)
# 88x128x128x128x2 / .0001 / x4 - 0.00483
# 88x256x256x128x2 / .0001 / x4 - 0.00635
# 50x512x256x128x2 / .0001 / x4 - 0.09744 - clf.feature_importances_>0.007

In [69]:
submission = pd.read_csv("sample_submission.csv")
submission.iloc[:, 1] = ans
submission.to_csv('answer.gb.nn.csv', index=None)

# 3. Gradient Boosting

In [106]:
clf = XGBClassifier(max_depth=5, base_score=0.005)
cv = StratifiedKFold(y, n_folds=3)
preds = np.ones(y.shape[0])
for i, (train, test) in enumerate(cv):
    preds[test] = clf.fit(X.values[train], y[train]).predict_proba(X.values[test])[:,1]
    print("fold {}, ROC AUC: {:.3f}".format(i, roc_auc_score(y[test], preds[test])))
print(roc_auc_score(y, preds))

fold 0, ROC AUC: 0.493
fold 1, ROC AUC: 0.499
fold 2, ROC AUC: 0.502
0.497108472527


In [107]:
# pick the best threshold out-of-fold
thresholds = np.linspace(0.01, 0.99, 50)
mcc = np.array([matthews_corrcoef(y, preds>thr) for thr in thresholds])
best_threshold = thresholds[mcc.argmax()]
print(mcc.max())

0.000258103875806


In [108]:
ans = (clf.predict_proba(X_test.values)[:,1] > best_threshold).astype(np.int8)

In [109]:
ans.shape

(1183748,)

In [110]:
# Result - 0.20991
submission = pd.read_csv("sample_submission.csv")
submission.iloc[:, 1] = ans
submission.to_csv('answer.gb.csv', index=None)