In [26]:
import gc
import xgboost as xgb
import numpy as np
import pandas as pd
import pickle
from scipy.sparse import csr_matrix
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.cross_validation import StratifiedKFold
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA, TruncatedSVD

### Load Data

In [2]:
print('Started Loading data...')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
print('Done!')

Started Loading data...
Done!


### Rename the columns and extract the labels

In [3]:
target = train['TARGET']
train.drop(['TARGET'], axis = 1, inplace = True)

### Remove Duplicate Features, including zero columns

In [4]:
print 'Removing Duplicate Columns....'
remove = []
c = train.columns
for i in range(len(c)-1):
    v = train[c[i]].values
    for j in range(i+1, len(c)):
        if np.array_equal(v, train[c[j]].values):
            remove.append(c[j])
train.drop(remove, axis=1, inplace=True)
test.drop(remove, axis=1, inplace=True)
print 'Done!'

Removing Duplicate Columns....
Done!


### remove constant features

In [5]:
print 'Removing constant columns....'
remove = []
for col in train.columns:
    if len(train[col].unique()) == 0:
        remove.append(col)
train.drop(remove, axis=1, inplace=True)
test.drop(remove, axis=1, inplace=True)
print 'Done!'

Removing constant columns....
Done!


### add a feature that counts number of assets

In [6]:
original_features = train.columns[1:-1]
train['SumZeros'] = (train[original_features] == 0).sum(axis=1)
test['SumZeros'] = (test[original_features] == 0).sum(axis=1)

### generate PCA features

In [7]:
df = pd.concat([train, test], axis = 0)
df.shape
n_train = train.shape[0] # for split the df back into train, test later
n_test = test.shape[0]

def generate_PCA_feature(train, test, original_feature, n_components = 4):
    """
    we fit a PCA decomposition model to training set of the shape
    : n_train_samples * n_features
    and transform on both training set and testing test. 
    """
    pca = PCA(n_components = n_components)
    # we need to normalize the data before fitting
    train_projected = pca.fit_transform(normalize(train[original_features], axis=0))
    test_projected = pca.transform(normalize(test[original_features], axis=0))
    for i in xrange(1, n_components + 1):
        name = 'PCA{:02d}'.format(i)
        train[name] = train_projected[:, i - 1]
        test[name] = test_projected[:, i - 1]
    return train, test
print 'Generating PCA features'
train, test = generate_PCA_feature(train, test, original_features, n_components = 2)
print 'Done!'

Generating PCA features
Done!


In [8]:
train.shape # two more features added, as expected

(76020, 311)

### truncated SVD features for nonlinear clustering

In [9]:
# we perform cross validation to set the n_components
def generate_SVD_feature(train, test, original_features, n_components = 5):
    """
    we fit a PCA decomposition model to training set of the shape
    : n_train_samples * n_features
    and transform on both training set and testing test. 
    """
    svd = TruncatedSVD(n_components = n_components)
    # we need to normalize the data before fitting
    train_projected = svd.fit_transform(train[original_features])
    test_projected = svd.transform(test[original_features])
    for i in xrange(1, n_components + 1):
        name = 'SVD{:02d}'.format(i)
        train[name] = train_projected[:, i - 1]
        test[name] = test_projected[:, i - 1]
    return train, test
print 'Generating SVD features'
train, test = generate_SVD_feature(train, test, original_features, n_components = 5)
print 'Done!'

Generating SVD features
Done!


In [10]:
train.shape # 5 more features added, as expected

(76020, 316)

### 10-fold cross validation splits

In [11]:
features = train.columns[1:-1]
split = 10
skf = StratifiedKFold(target,
                      n_folds=split,
                      shuffle=False,
                      random_state=42)

### Set parameters for XGBosst Model

In [12]:
num_rounds = 350
params = {}
params["objective"] = "binary:logistic"
params["eta"] = 0.03
params["subsample"] = 0.8
params["colsample_bytree"] = 0.7
params["silent"] = 1
params["max_depth"] = 5
params["min_child_weight"] = 1
params["eval_metric"] = "auc"

In [13]:
train.columns

Index([u'ID', u'var3', u'var15', u'imp_ent_var16_ult1',
       u'imp_op_var39_comer_ult1', u'imp_op_var39_comer_ult3',
       u'imp_op_var40_comer_ult1', u'imp_op_var40_comer_ult3',
       u'imp_op_var40_efect_ult1', u'imp_op_var40_efect_ult3',
       ...
       u'saldo_medio_var44_ult3', u'var38', u'SumZeros', u'PCA01', u'PCA02',
       u'SVD01', u'SVD02', u'SVD03', u'SVD04', u'SVD05'],
      dtype='object', length=316)

In [14]:
train['target'] = target # add back the label
features = train.columns[1:-1] # exclude ID

In [15]:
train_preds = None
test_preds = None
xgb_classifiers = [] # List[[clf, score]]
for index, (train_index, test_index) in enumerate(skf):
    print('Fold:', index)
    X_train = train.iloc[train_index] # split training set into'train', 'cross-validation' sets
    X_test = train.iloc[test_index]
    
    # for xgb classifier, we transform them into DMatrix format
    D_train = xgb.DMatrix(
                    csr_matrix(X_train[features]),
                    X_train.target.values,
                    silent=True)
    
    D_test = xgb.DMatrix(
                    csr_matrix(X_test[features]),
                    X_test.target.values,
                    silent=True)
    watchlist = [(D_test, 'eval'), (D_train, 'train')]
    
    # fit the classfier now
    clf = xgb.train(params, D_train, num_rounds,
                    evals = watchlist, early_stopping_rounds=50,
                    verbose_eval=False)

    test_prediction = clf.predict(D_test)
    print('Blind Log Loss:', log_loss(X_test.target.values,
                                      test_prediction))
    score = roc_auc_score(X_test.target.values,
                                      test_prediction)
    print('Blind ROC:', score)
    index = index + 1
    
    del X_train, X_test, D_train, D_test
    gc.collect()
    print 'finished a training model'
    print 'fitting on full data set now...'
    
    D_full_train = \
        xgb.DMatrix(csr_matrix(train[features]),
                    train.target.values,
                    silent=True)
    D_full_test = \
        xgb.DMatrix(csr_matrix(test[features]),
                    silent=True)
    if(train_preds is None):
        train_preds = clf.predict(D_full_train)
        test_preds = clf.predict(D_full_test)
    else:
        train_preds *= clf.predict(D_full_train) # we manually perform an average of the results
        test_preds *= clf.predict(D_full_test)
    xgb_classifiers.append([clf, 'with auc score: {:10f}'.format(score)])
    del D_full_train, D_full_test, clf
    gc.collect()
print 'Done!'

('Fold:', 1)
('Blind Log Loss:', 0.13505756484651632)
('Blind ROC:', 0.83806079615924634)
finished a training model
fitting on full data set now...
('Fold:', 2)
('Blind Log Loss:', 0.1341003199256714)
('Blind ROC:', 0.83880423239980662)
finished a training model
fitting on full data set now...
('Fold:', 3)
('Blind Log Loss:', 0.14165757352950137)
('Blind ROC:', 0.81413527751397996)
finished a training model
fitting on full data set now...
('Fold:', 4)
('Blind Log Loss:', 0.13285799161095355)
('Blind ROC:', 0.84142344310909944)
finished a training model
fitting on full data set now...
('Fold:', 5)
('Blind Log Loss:', 0.13225275204203996)
('Blind ROC:', 0.84591311161580274)
finished a training model
fitting on full data set now...
('Fold:', 6)
('Blind Log Loss:', 0.13416997346358403)
('Blind ROC:', 0.84142412567158464)
finished a training model
fitting on full data set now...
('Fold:', 7)
('Blind Log Loss:', 0.13143148793218037)
('Blind ROC:', 0.85123163850034667)
finished a training mod

### save the xgb_classifiers lists for later ensemble

In [29]:
pickle.dump(xgb_classifiers, open('xgboost_classifier_param1.dat', 'wb'))

In [31]:
data = pickle.load(open('xgboost_classifier_param1.dat', 'rb'))

### set another set of parameters, use a randomized search

In [25]:
split = 5
random_state = 111
skf = StratifiedKFold(target,
                      n_folds=split,
                      shuffle=False,
                      random_state= random_state)
num_rounds = 350 + np.random.randint(low = -50, high = 50)
params = {}
params["objective"] = "binary:logistic"
params["eta"] = 0.03 + np.random.normal(loc = 0.0, scale = 0.01)
params["subsample"] = 0.8
params["colsample_bytree"] = 0.7
params["silent"] = 1
params["max_depth"] = 5 + np.random.randint(low = -1, high = 2)
params["min_child_weight"] = 1
params["eval_metric"] = "auc"

In [46]:
train_preds = None
test_preds = None
xgb_classifiers2 = []
for index, (train_index, test_index) in enumerate(skf):
    print('Fold:', index)
    X_train = train.iloc[train_index] # split training set into'train', 'cross-validation' sets
    X_test = train.iloc[test_index]
    
    # for xgb classifier, we transform them into DMatrix format
    D_train = xgb.DMatrix(
                    csr_matrix(X_train[features]),
                    X_train.target.values,
                    silent=True)
    
    D_test = xgb.DMatrix(
                    csr_matrix(X_test[features]),
                    X_test.target.values,
                    silent=True)
    watchlist = [(D_test, 'eval'), (D_train, 'train')]
    
    # fit the classfier now
    clf = xgb.train(params, D_train, num_rounds,
                    evals = watchlist, early_stopping_rounds=50,
                    verbose_eval=False)

    test_prediction = clf.predict(D_test)
    print('Blind Log Loss:', log_loss(X_test.target.values,
                                      test_prediction))
    score = roc_auc_score(X_test.target.values,
                                      test_prediction)
    print('Blind ROC:', score)
    index = index + 1
    
    del X_train, X_test, D_train, D_test
    gc.collect()
    print 'finished a training model'
    print 'fitting on full data set now...'
    
    D_full_train = \
        xgb.DMatrix(csr_matrix(train[features]),
                    train.target.values,
                    silent=True)
    D_full_test = \
        xgb.DMatrix(csr_matrix(test[features]),
                    silent=True)
    if(train_preds is None):
        train_preds = clf.predict(D_full_train)
        test_preds = clf.predict(D_full_test)
    else:
        train_preds *= clf.predict(D_full_train) # we manually perform an average of the results
        test_preds *= clf.predict(D_full_test)
    xgb_classifiers2.append([clf, 'with auc score: {:10f}'.format(score)])
    del D_full_train, D_full_test, clf
    gc.collect()
print 'Done!'

('Fold:', 0)
('Blind Log Loss:', 0.13515509804987907)
('Blind ROC:', 0.83580701685336112)
finished a training model
fitting on full data set now...
('Fold:', 1)
('Blind Log Loss:', 0.13680937251164763)
('Blind ROC:', 0.82864225095512389)
finished a training model
fitting on full data set now...
('Fold:', 2)
('Blind Log Loss:', 0.13302371896058762)
('Blind ROC:', 0.84442239514816397)
finished a training model
fitting on full data set now...
('Fold:', 3)
('Blind Log Loss:', 0.12968091185474287)
('Blind ROC:', 0.85590906677247269)
finished a training model
fitting on full data set now...
('Fold:', 4)
('Blind Log Loss:', 0.13412376736040632)
('Blind ROC:', 0.83843641868857111)
finished a training model
fitting on full data set now...
Done!


In [48]:
print params

{'colsample_bytree': 0.7, 'silent': 1, 'eval_metric': 'auc', 'min_child_weight': 1, 'subsample': 0.8, 'eta': 0.019899065033745854, 'objective': 'binary:logistic', 'max_depth': 5}


In [47]:
train_preds = np.power(train_preds, 1./split)
test_preds = np.power(test_preds, 1./split)
print('Average Log Loss:', log_loss(train.target.values, train_preds))
print('Average ROC:', roc_auc_score(train.target.values, train_preds))

('Average Log Loss:', 0.12388091950835343)
('Average ROC:', 0.87960446479360432)


In [None]:
submission = pd.DataFrame({"ID": train.ID,
                           "TARGET": train.target,
                           "PREDICTION": train_preds})

submission.to_csv("simplexgbtrain.csv", index=False)
submission = pd.DataFrame({"ID": test.ID, "TARGET": test_preds})
submission.to_csv("simplexgbtest.csv", index=False)
print('Finish')