In [17]:
import pandas as pd
import numpy as np
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import metrics
from sklearn import ensemble
from sklearn.metrics import log_loss

In [18]:
print('Load data...')

# Seed NumPy's global RNG so the random fold labels drawn below are the same
# every time this cell is executed.
np.random.seed(seed = 199)

# Number of cross-validation folds. Rows are labelled 1..N_FOLDS in the 'rand'
# column; this has to agree with the fold range the training loop iterates.
N_FOLDS = 5


def _assign_folds(n_rows, n_folds=N_FOLDS):
    """Draw a random fold label in 1..n_folds (as floats) for each of n_rows rows.

    np.random.random_sample draws from the half-open interval [0, 1), so
    ceil(n_folds * draw) is 0 when a draw is exactly 0.0. A row labelled 0
    would never be selected by any fold, so the result is clamped to
    [1, n_folds]. Every other draw maps to the same label as a plain ceil.
    """
    draws = np.random.random_sample(n_rows)
    return np.clip(np.ceil(n_folds * draws), 1, n_folds)


# NOTE(review): absolute, user-specific paths -- the notebook only runs on a
# machine that has these files at exactly these locations.
train = pd.read_csv("/Users/rogetoon/BNPParibasKaggle/data/train_stacking.csv")
train['rand'] = _assign_folds(len(train.index))
test = pd.read_csv("/Users/rogetoon/BNPParibasKaggle/data/test_stacking.csv")
test['rand'] = _assign_folds(len(test.index))

# ensure we have same variables and in same order:
# give test a placeholder 'target' column, then select train's column list.
cols = train.columns.tolist()
test['target'] = 1
test = test[cols]

Load data...


In [19]:
print('Prepare data...')

# Sentinel written in place of missing values in non-object columns.
MISSING_VALUE = -999

# Walk the two frames column by column, pairing columns by position.
# Column names are snapshotted up front because the loop assigns back into
# the frames it is walking.
for train_name, test_name in zip(list(train.columns), list(test.columns)):
    if train[train_name].dtype == 'O':
        # for objects: factorize on train, then encode test with the same
        # category -> code mapping so codes agree between the two frames.
        codes, categories = pd.factorize(train[train_name])
        train[train_name] = codes
        test[test_name] = categories.get_indexer(test[test_name])
        # NaN in train, and any test value not among train's categories
        # (including NaN), is encoded as -1.
    else:
        # for int or float: fill NaN with the sentinel. fillna is a no-op on
        # columns without missing values, so no separate null count is needed.
        train[train_name] = train[train_name].fillna(MISSING_VALUE)
        test[test_name] = test[test_name].fillna(MISSING_VALUE)

Prepare data...


In [20]:
folds = range(1, 6)

# Columns that are not model features.
non_feature_cols = ['ID', 'target', 'rand']

scores = []
# Per-fold prediction frames are collected in plain lists and concatenated
# once after the loop, instead of re-copying a growing DataFrame on every
# iteration.
test_parts = []
stacker_parts = []

for fold in folds:
    print('Run interation ' + str(fold) + '...')
    print('  Create folds...')
    # Row masks for this fold, computed once and reused below.
    is_val = train['rand'] == fold
    is_test = test['rand'] == fold
    fit_rows = train[~is_val]
    val_rows = train[is_val]
    test_rows = test[is_test]

    # in fold - for training
    X_train = fit_rows.drop(non_feature_cols, axis=1)
    y_train = fit_rows['target'].values
    # out of fold - for predictions
    X_val = val_rows.drop(non_feature_cols, axis=1)
    y_val = val_rows['target'].values
    # only the test rows labelled with this fold are scored by this model
    X_test = test_rows.drop(non_feature_cols, axis=1)
    # for storing meta features
    id_val = val_rows['ID'].values
    id_test = test_rows['ID'].values

    print('  Training...')
    # NOTE(review): no random_state is passed, so run-to-run reproducibility
    # depends on the state of NumPy's global RNG when this cell is executed.
    extc = ExtraTreesClassifier(n_estimators=5000, max_features=60,
                                criterion='entropy', min_samples_split=4,
                                max_depth=60, min_samples_leaf=2, n_jobs=-1)
    extc.fit(X_train, y_train)

    print('  Predicting...')
    # Column 1 of predict_proba is taken as the predicted probability.
    y_val_pred = extc.predict_proba(X_val)[:,1]
    y_test_pred = extc.predict_proba(X_test)[:,1]

    test_fold_preds = pd.DataFrame({"ID": id_test, "PredictedProb": y_test_pred})
    val_fold_preds = pd.DataFrame({"ID": id_val, "PredictedProb": y_val_pred})
    # The submission gets the test predictions only; the stacker file gets the
    # test predictions followed by the out-of-fold validation predictions.
    test_parts.append(test_fold_preds)
    stacker_parts.append(test_fold_preds)
    stacker_parts.append(val_fold_preds)
    scores.append(metrics.log_loss(y_val, y_val_pred))

# Build the final frames in one pass; rows keep the per-fold order in which
# they were collected.
test_out = pd.concat(test_parts)
stacker_out = pd.concat(stacker_parts)

Run interation 1...
  Create folds...
  Training...
  Predicting...
Run interation 2...
  Create folds...
  Training...
  Predicting...
Run interation 3...
  Create folds...
  Training...
  Predicting...
Run interation 4...
  Create folds...
  Training...
  Predicting...
Run interation 5...
  Create folds...
  Training...
  Predicting...


In [21]:
# Mean of the per-fold validation log-loss values collected in `scores`.
# Written as a function call so the cell parses on Python 3 as well as
# Python 2, matching the print(...) calls used in the other cells.
print(np.mean(scores))

0.44538750366


In [22]:
# Write both result frames to CSV, omitting the DataFrame index column.
submission_csv = '/Users/rogetoon/BNPParibasKaggle/finished_models_output/extc_stacking_02_submission.csv'
stacker_csv = '/Users/rogetoon/BNPParibasKaggle/finished_models_output/extc_stacking_02_stacker.csv'

# NOTE(review): the trailing number on the next line was left by the author
# without explanation -- presumably the score obtained by this submission;
# confirm before relying on it.
test_out.to_csv(submission_csv, index=False)  #0.446303945108
stacker_out.to_csv(stacker_csv, index=False)