Alexander Guschin, 2nd place in Countable Care: Modeling Women's Health Care Decisions

http://www.drivendata.org/competitions/6/

In [None]:
from __future__ import division

import logging
import random
import sys
from copy import deepcopy
from multiprocessing import Pool

import numpy as np
import pandas as pd
import sklearn
import sklearn.linear_model
import sklearn.metrics
import sklearn.svm
from pandas import DataFrame as df
from pandas import read_csv
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn releases keep it in cross_validation
    from sklearn.cross_validation import train_test_split

sys.path.append('/home/ubuntu/xgboost/wrapper/')
import xgboost as xgb

def pool(func, arg, N=15):
    """Map ``func`` over ``arg`` using a temporary pool of worker processes.

    Parameters
    ----------
    func : callable
        Callable handed to ``Pool.map``; applied to every item of ``arg``.
    arg : iterable
        Items to process.
    N : int, optional
        Number of worker processes to start (default 15).

    Returns
    -------
    list
        Results of ``func`` in the same order as the items of ``arg``.
    """
    mpool = Pool(N)
    try:
        return mpool.map(func, arg)
    finally:
        # Reap the workers even when ``func`` raises inside ``map``; without
        # this every failed call would leave N worker processes behind.
        mpool.terminate()
        mpool.join()

def sigm(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of ``x``.

    The value is computed as ``exp(-log(1 + exp(-x)))`` through
    ``np.logaddexp`` so that large negative inputs saturate to 0 without the
    intermediate ``exp(-x)`` overflowing.

    Parameters
    ----------
    x : scalar or array-like
        Input score(s); converted to a float array.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Sigmoid of ``x``, element-wise, with the same shape as ``x``.
    """
    scores = np.asarray(x, dtype=float)
    # logaddexp(0, -x) == log(1 + exp(-x)), evaluated without overflow.
    return np.exp(-np.logaddexp(0, -scores))

In [4]:
# Load the raw competition tables. The separator is passed by keyword:
# current pandas releases accept only the path as a positional argument of
# ``read_csv``, so the positional ',' form fails there.
train = read_csv('data/train_values.csv', sep=',', low_memory=False)
test = read_csv('data/test_values.csv', sep=',', low_memory=False)
labels = read_csv('data/train_labels.csv', sep=',')
sampleSubmission = read_csv('data/SubmissionFormat.csv', sep=',')

In [22]:
# Feature column names: every column of ``train`` except the 'id' column.
xcols = train.columns.tolist()
xcols.remove('id')

# Target column names: every column of ``labels`` except the 'id' column.
ycols = labels.columns.tolist()
ycols.remove('id')

# Drop the columns that hold a single distinct value.
for c in train.columns:
    n_distinct = len(train[c].unique())
    if n_distinct == 1:
        xcols.remove(c)

In [6]:
def log_loss16(y_true, y_pred):
    """Return scikit-learn's log loss of ``y_pred`` against ``y_true``.

    ``eps=1e-16`` is forwarded to ``sklearn.metrics.log_loss``.
    NOTE(review): ``eps`` is not accepted by recent scikit-learn releases;
    confirm against the installed version.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted probabilities.

    Returns
    -------
    float
        The log loss value.
    """
    # Local import: nothing else brings ``log_loss`` into scope, so the bare
    # name would raise NameError on the first call.
    from sklearn.metrics import log_loss

    return log_loss(y_true, y_pred, eps=1e-16)

# Scorer built from ``log_loss16``. Log loss is an error measure (smaller is
# better), so it is registered with ``greater_is_better=False``: scikit-learn
# then negates the value, and model selection that maximises the score picks
# the lowest loss instead of the highest.
logloss16 = sklearn.metrics.make_scorer(log_loss16, greater_is_better=False, needs_proba=True)

def sort_grid_scores(grid_scores_):
    """Return the grid-search entries ordered by ascending mean score.

    Parameters
    ----------
    grid_scores_ : sequence
        Entries whose second item (index 1) is the mean score, e.g.
        ``(params, mean_score, cv_scores)`` tuples.

    Returns
    -------
    list
        The same entry objects, sorted by their mean score (ties keep their
        input order).
    """
    # Sort the argument itself rather than a global grid-search object, and
    # keep the entries as-is instead of coercing them into a ragged array.
    order = np.argsort([entry[1] for entry in grid_scores_], kind='mergesort')
    return [grid_scores_[pos] for pos in order]

def print_grid_scores(grid_scorer):
    """Print one summary line per entry of ``grid_scorer.grid_scores_``.

    Entries are ordered by ``sort_grid_scores``; each line shows the mean
    score, half the standard deviation of the per-fold scores, and the
    parameter setting.

    Parameters
    ----------
    grid_scorer : object
        Object exposing a ``grid_scores_`` attribute of
        ``(params, mean_score, scores)`` entries.
    """
    for entry in sort_grid_scores(grid_scorer.grid_scores_):
        params, mean_score, scores = entry
        half_std = scores.std() / 2
        print("%0.3f (+/-%0.03f) for %r" % (mean_score, half_std, params))

# Preprocessing

In [7]:
# Load the "2" variant of the train/test feature tables from the data folder.
train2, test2 = map(read_csv, ('data/train2.csv', 'data/test2.csv'))

In [9]:
# Load the "V" variant of the train/test feature tables from the data folder.
trainV, testV = map(read_csv, ('data/trainV.csv', 'data/testV.csv'))

# Feature importances

In [11]:
# Per-target feature importances on ``train2``: one random forest fit per
# label column, keeping the importance vector of each fit.
imp_cols = list(xcols)
rf = RandomForestClassifier(criterion='entropy', max_features=100,
                            n_estimators=500, n_jobs=15)

rf_imp = []
loss = []
for i, c in enumerate(ycols):
    rf.fit(train2[imp_cols], labels[c])
    rf_imp.append(rf.feature_importances_)




In [12]:
# Per-target feature importances on ``trainV`` (all of its columns): one
# random forest fit per label column, keeping each importance vector.
rf = RandomForestClassifier(criterion='entropy', max_features=100,
                            n_estimators=500, n_jobs=15)

rf_impV = []
loss = []
for i, c in enumerate(ycols):
    rf.fit(trainV, labels[c])
    rf_impV.append(rf.feature_importances_)

# Meta-features & Stacking

In [15]:
# Random-forest meta-features (importance cut-off 0.0001, max_features=200):
# held-out predictions for the training rows and fold-averaged predictions for
# the test rows, one column per target, averaged over N random 50/50 splits.
alg = RandomForestClassifier(n_estimators=100, n_jobs=15, criterion='entropy', max_features=200)

N = 5
# Buffers are sized from trainV/testV (the frames actually indexed below) and
# from the number of targets rather than a literal 14.
ytrain_N = np.zeros((trainV.shape[0], len(ycols), N))
ycv_N = np.zeros((testV.shape[0], len(ycols), N))

for nc, c in enumerate(ycols):
    # Integer positions of the columns whose importance exceeds the cut-off.
    impcols = np.arange(trainV.shape[1])[rf_impV[nc] > 0.0001]

    # ``impcols`` holds positions, not labels, so select with ``iloc``; the
    # slices do not change inside the split loop and are taken once.
    Xtrain = trainV.iloc[:, impcols].values
    Xtest = testV.iloc[:, impcols].values
    ytarget = labels[c].values

    for i in range(N):
        mask = train_test_split(np.arange(trainV.shape[0]), test_size=.5)
        ansmask = []
        # Two-fold pass: fit on one half, predict the other half, then swap.
        for m1, m2 in [(mask[0], mask[1]), (mask[1], mask[0])]:
            alg.fit(Xtrain[m1], ytarget[m1])
            ytrain_N[m2, nc, i] = alg.predict_proba(Xtrain[m2])[:, 1]
            ansmask += [alg.predict_proba(Xtest)[:, 1]]
        ycv_N[:, nc, i] = sum(ansmask) / 2

Mtrain = df(ytrain_N.mean(axis=2), columns=ycols)
Mtest = df(ycv_N.mean(axis=2), columns=ycols)




In [16]:
# Random-forest meta-features, second variant (importance cut-off 0.00001,
# max_features=300): held-out predictions for the training rows and
# fold-averaged predictions for the test rows, averaged over N random splits.
alg = RandomForestClassifier(n_estimators=100, n_jobs=15, criterion='entropy', max_features=300)

N = 5
# Buffers are sized from trainV/testV (the frames actually indexed below) and
# from the number of targets rather than a literal 14.
ytrain_N = np.zeros((trainV.shape[0], len(ycols), N))
ycv_N = np.zeros((testV.shape[0], len(ycols), N))

for nc, c in enumerate(ycols):
    # Integer positions of the columns whose importance exceeds the cut-off.
    impcols = np.arange(trainV.shape[1])[rf_impV[nc] > 0.00001]

    # ``impcols`` holds positions, not labels, so select with ``iloc``; the
    # slices do not change inside the split loop and are taken once.
    Xtrain = trainV.iloc[:, impcols].values
    Xtest = testV.iloc[:, impcols].values
    ytarget = labels[c].values

    for i in range(N):
        mask = train_test_split(np.arange(trainV.shape[0]), test_size=.5)
        ansmask = []
        # Two-fold pass: fit on one half, predict the other half, then swap.
        for m1, m2 in [(mask[0], mask[1]), (mask[1], mask[0])]:
            alg.fit(Xtrain[m1], ytarget[m1])
            ytrain_N[m2, nc, i] = alg.predict_proba(Xtrain[m2])[:, 1]
            ansmask += [alg.predict_proba(Xtest)[:, 1]]
        ycv_N[:, nc, i] = sum(ansmask) / 2

Mtrain2 = df(ytrain_N.mean(axis=2), columns=ycols)
Mtest2 = df(ycv_N.mean(axis=2), columns=ycols)

In [17]:
def fast(i):
    """Run one seeded two-fold pass of the module-level estimator ``alg``.

    Reads ``alg``, ``impcols``, ``c``, ``trainV``, ``testV`` and ``labels``
    from module scope. The rows of ``trainV`` are split 50/50 with
    ``random_state=i``; the model is fit on each half in turn and scores the
    other half and the full test set. Decision-function values are squashed
    with ``sigm``.

    Parameters
    ----------
    i : int
        Seed used for ``random.seed`` and for the train/test split.

    Returns
    -------
    tuple of numpy.ndarray
        ``(ytrain_i, ytest_i)``: held-out scores for every row of ``trainV``
        and the mean of the two folds' scores for the rows of ``testV``.
    """
    random.seed(i)

    # Materialise the selected columns once instead of once per fold.
    Xtrain = trainV[impcols].values
    Xtest = testV[impcols].values
    ytarget = labels[c].values

    # Size the split and the output from the frame that is actually indexed
    # (trainV), not from a different table that merely shares its row count.
    n_rows = Xtrain.shape[0]
    mask = train_test_split(np.arange(n_rows), test_size=.5, random_state=i)

    ansmask = []
    ytrain_i = np.zeros(n_rows)
    for m1, m2 in [(mask[0], mask[1]), (mask[1], mask[0])]:
        alg.fit(Xtrain[m1], ytarget[m1])

        ytrain_i[m2] = sigm(alg.decision_function(Xtrain[m2]))
        ansmask += [sigm(alg.decision_function(Xtest))]
    return ytrain_i, sum(ansmask) / 2

In [20]:
# Logistic-regression meta-features: for every target, average N seeded
# two-fold runs of ``fast`` executed in a process pool.
alg = sklearn.linear_model.LogisticRegression(C=0.01)

N = 10
# One column per target; sized from ``ycols`` rather than a literal 14.
ytrain_N = np.zeros((trainV.shape[0], len(ycols)))
ycv_N = np.zeros((testV.shape[0], len(ycols)))

for nc, c in enumerate(ycols):
    # ``fast`` reads ``alg``, ``impcols`` and ``c`` from module scope, so they
    # are set here before the pool is started.
    # NOTE(review): this relies on workers inheriting module globals
    # (fork-style process start) -- confirm on the target platform.
    impcols = np.array(trainV.columns)[rf_impV[nc] > 0.0001]

    ans = pool(fast, range(N))
    ytrain_N[:, nc] = sum(run[0] for run in ans) / N
    ycv_N[:, nc] = sum(run[1] for run in ans) / N

MtrainLin = df(ytrain_N, columns=ycols)
MtestLin = df(ycv_N, columns=ycols)

In [25]:
# Linear-SVC meta-features: for every target, average N seeded two-fold runs
# of ``fast`` executed in a process pool.
alg = sklearn.svm.LinearSVC()

N = 10
# One column per target; sized from ``ycols`` rather than a literal 14.
ytrain_N = np.zeros((trainV.shape[0], len(ycols)))
ycv_N = np.zeros((testV.shape[0], len(ycols)))

for nc, c in enumerate(ycols):
    # ``fast`` reads ``alg``, ``impcols`` and ``c`` from module scope, so they
    # are set here before the pool is started.
    # NOTE(review): this relies on workers inheriting module globals
    # (fork-style process start) -- confirm on the target platform.
    impcols = np.array(trainV.columns)[rf_impV[nc] > 0.0001]

    ans = pool(fast, range(N))
    ytrain_N[:, nc] = sum(run[0] for run in ans) / N
    ycv_N[:, nc] = sum(run[1] for run in ans) / N

MtrainLin2 = df(ytrain_N, columns=ycols)
MtestLin2 = df(ycv_N, columns=ycols)

In [None]:
# Passive-aggressive meta-features: for every target, average N seeded
# two-fold runs of ``fast`` executed in a process pool.
alg = sklearn.linear_model.PassiveAggressiveClassifier()

N = 10
# One column per target; sized from ``ycols`` rather than a literal 14.
ytrain_N = np.zeros((trainV.shape[0], len(ycols)))
ycv_N = np.zeros((testV.shape[0], len(ycols)))

for nc, c in enumerate(ycols):
    # ``fast`` reads ``alg``, ``impcols`` and ``c`` from module scope, so they
    # are set here before the pool is started.
    # NOTE(review): this relies on workers inheriting module globals
    # (fork-style process start) -- confirm on the target platform.
    impcols = np.array(trainV.columns)[rf_impV[nc] > 0.0001]

    ans = pool(fast, range(N))
    ytrain_N[:, nc] = sum(run[0] for run in ans) / N
    ycv_N[:, nc] = sum(run[1] for run in ans) / N

MtrainLin4 = df(ytrain_N, columns=ycols)
MtestLin4 = df(ycv_N, columns=ycols)

In [None]:
# XGBoost meta-features: held-out predictions for the training rows and
# fold-averaged predictions for the test rows, one column per target,
# averaged over N random 50/50 splits. Every column of trainV/testV is used
# (no importance filtering is applied in this cell).
param = {}
param['booster'] = 'gbtree'
param['objective'] = 'binary:logistic'
param['eval_metric'] = 'logloss'
param['scale_pos_weight'] = 1.0
# NOTE(review): the 'bst:'-prefixed keys follow an old xgboost naming scheme;
# confirm the installed xgboost version still honours them.
param['bst:eta'] = 0.25
param['bst:max_depth'] = 5
param['bst:colsample_bytree'] = 0.25
param['silent'] = 1
param['nthread'] = 15

num_round = 120
plst = list(param.items())

N = 5
# One slice per target; sized from ``ycols`` rather than a literal 14.
ytrain_N = np.zeros((trainV.shape[0], len(ycols), N))
ycv_N = np.zeros((testV.shape[0], len(ycols), N))

# The test matrix depends on neither the target nor the split: build it once.
# -1 is passed to xgboost as the missing-value marker.
Gcv = xgb.DMatrix(testV.values, missing=-1)
Xall = trainV.values

for nc, c in enumerate(ycols):
    ytarget = labels[c].values

    for i in range(N):
        mask = train_test_split(np.arange(trainV.shape[0]), test_size=.5)
        m1, m2 = mask[0], mask[1]

        Gtrain1 = xgb.DMatrix(Xall[m1], label=ytarget[m1], missing=-1)
        Gtrain2 = xgb.DMatrix(Xall[m2], label=ytarget[m2], missing=-1)

        ansmask = []
        # Train on one half and predict the other half (rows ``m``), then swap.
        for t1, t2, m in [(Gtrain1, Gtrain2, m2), (Gtrain2, Gtrain1, m1)]:
            bst = xgb.train(plst, t1, num_round)
            ytrain_N[m, nc, i] = bst.predict(t2)
            ansmask += [bst.predict(Gcv)]

        ycv_N[:, nc, i] = sum(ansmask) / 2

MtrainX = df(ytrain_N.mean(axis=2), columns=ycols)
MtestX = df(ycv_N.mean(axis=2), columns=ycols)

# Submissions

In [None]:
# Submission 'xgbFM': one shallow XGBoost model per target, trained on the
# trainV columns stacked with all six meta-feature tables.
submission = sampleSubmission.copy()

param = {}
param['booster'] = 'gbtree'
param['objective'] = 'binary:logistic'
param['eval_metric'] = 'logloss'
param['scale_pos_weight'] = 1
param['bst:eta'] = 0.2
param['bst:max_depth'] = 2
param['bst:colsample_bytree'] = 1
param['bst:subsample'] = 1
param['silent'] = 1
param['nthread'] = 15

# Number of boosting rounds per target, in ``ycols`` order.
num_rounds = [105, 50, 50, 60, 65, 80, 40, 45, 45, 55, 35, 40, 40, 55]
plst = list(param.items())

# The stacked feature matrices are the same for every target, so they are
# assembled once instead of once per loop iteration.
Xtrain_full = np.concatenate([trainV, Mtrain, Mtrain2, MtrainLin, MtrainLin2, MtrainLin4, MtrainX], axis=1)
Xtest_full = np.concatenate([testV, Mtest, Mtest2, MtestLin, MtestLin2, MtestLin4, MtestX], axis=1)
# -1 is passed to xgboost as the missing-value marker.
Gtest = xgb.DMatrix(Xtest_full, missing=-1)

for nc, c in enumerate(ycols):
    # Only the label vector changes between targets.
    Gtrain = xgb.DMatrix(Xtrain_full, label=labels[c].values, missing=-1)

    bst = xgb.train(plst, Gtrain, num_rounds[nc])
    pred = bst.predict(Gtest)
    submission[c] = pred

submission.to_csv('submissions/xgbFM.csv', index=False)

In [None]:
# Submission 'nnM': one small theanets classifier per target, trained on the
# six stacked meta-feature tables only.
import theanets

# The network inputs are the same for every target: build them once.
ntrain = np.concatenate([Mtrain2, Mtrain, MtrainLin, MtrainLin2, MtrainLin4, MtrainX], axis=1).astype(np.float32)
ncv = np.concatenate([Mtest2, Mtest, MtestLin, MtestLin2, MtestLin4, MtestX], axis=1).astype(np.float32)

nn_predF = []
for nc, c in enumerate(ycols):
    exp = theanets.Experiment(
        theanets.Classifier,
        # Input width follows the stacked matrix (6 tables x 14 targets = 84
        # columns in the original setup) instead of a hard-coded literal.
        layers=(ntrain.shape[1], 5, 2),
        train_batches=1000,
    )

    exp.train(
        (ntrain, labels[c].values.astype(np.int32)),
    )

    # Column 1 of the network output is kept as the prediction for this target.
    pred = exp.network.predict(ncv)[:, 1]
    nn_predF += [pred]

submission = sampleSubmission.copy()

for c, pred in zip(ycols, nn_predF):
    submission[c] = pred

submission.to_csv('submissions/nnM.csv', index=False)

In [None]:
# Submission 'rfF': one extra-trees model per target on the trainV columns
# whose importance for that target exceeds 0.0001.
submission = sampleSubmission.copy()
rf = ExtraTreesClassifier(n_estimators=1000, n_jobs=15, criterion='entropy', max_features=200)

for nc, c in enumerate(ycols):
    # Integer positions of the selected columns.
    imp_cols = np.arange(trainV.shape[1])[rf_impV[nc] > 0.0001]

    # ``imp_cols`` holds positions, not column labels, so select with
    # ``iloc``; plain ``trainV[imp_cols]`` is a label lookup.
    rf.fit(trainV.iloc[:, imp_cols], labels[c])
    pred = rf.predict_proba(testV.iloc[:, imp_cols])[:, 1]

    submission[c] = pred

submission.to_csv('submissions/rfF.csv', index=False)

In [None]:
# Submission 'xgbF': one XGBoost model per target on all trainV columns
# (no meta-features).
submission = sampleSubmission.copy()

param = {}
param['booster'] = 'gbtree'
param['objective'] = 'binary:logistic'
param['eval_metric'] = 'logloss'
param['scale_pos_weight'] = 1.0
param['bst:eta'] = 0.25
param['bst:max_depth'] = 5
param['bst:colsample_bytree'] = 0.25
param['silent'] = 1
param['nthread'] = 15

num_round = 120
plst = list(param.items())

# The test matrix is the same for every target: build it once.
# -1 is passed to xgboost as the missing-value marker.
Gtest = xgb.DMatrix(testV.values, missing=-1)
Xall = trainV.values

for c in ycols:
    # Only the label vector changes between targets.
    Gtrain = xgb.DMatrix(Xall, label=labels[c].values, missing=-1)

    bst = xgb.train(plst, Gtrain, num_round)
    pred = bst.predict(Gtest)

    submission[c] = pred

submission.to_csv('submissions/xgbF.csv', index=False)

In [None]:
# Submission 'rfFM': one extra-trees model per target on the important
# trainV columns stacked with all six meta-feature tables.
logging.info('')

submission = sampleSubmission.copy()

rf = ExtraTreesClassifier(n_estimators=200, n_jobs=15, criterion='entropy', max_features=320)

# The meta-feature part is the same for every target: assemble it once.
meta_train = np.concatenate([Mtrain, Mtrain2, MtrainLin, MtrainLin2, MtrainLin4, MtrainX], axis=1)
meta_test = np.concatenate([Mtest, Mtest2, MtestLin, MtestLin2, MtestLin4, MtestX], axis=1)

# ``enumerate`` supplies the target index; a bare ``for c in ycols`` would
# reuse whatever ``nc`` an earlier cell left behind and apply one target's
# importance mask to every target.
for nc, c in enumerate(ycols):
    impcols = np.arange(trainV.shape[1])[rf_impV[nc] > 0.0001]

    # ``impcols`` holds positions, not column labels, so select with ``iloc``.
    Xtrain = np.concatenate([trainV.iloc[:, impcols].values, meta_train], axis=1)
    Xtest = np.concatenate([testV.iloc[:, impcols].values, meta_test], axis=1)

    rf.fit(Xtrain, labels[c])
    pred = rf.predict_proba(Xtest)[:, 1]

    submission[c] = pred

submission.to_csv('submissions/rfFM.csv', index=False)

logging.info('')

In [None]:
# Reload the five single-model submission files written to submissions/.
sub1, sub2, sub3, sub4, sub5 = map(read_csv, (
    'submissions/nnM.csv',
    'submissions/rfFM.csv',
    'submissions/xgbFM.csv',
    'submissions/rfF.csv',
    'submissions/xgbF.csv',
))

In [None]:
# Final blend: 75% weight on the equal-weight mean of sub1..sub3 and 25% on
# the equal-weight mean of sub4 and sub5, computed column by column.
submissionMix = sampleSubmission.copy()

for c in ycols:
    first_group = sub1[c].values * 1/3 + sub2[c].values * 1/3 + sub3[c].values * 1/3
    second_group = sub4[c] * .5 + .5 * sub5[c]
    pred = first_group * .75 + .25 * second_group

    submissionMix[c] = pred

submissionMix.to_csv('submissions/final_submit.csv', index=False)