In [1]:
import gc
import xgboost as xgb 
import numpy as np
import pandas as pd
import pickle
from scipy.sparse import csr_matrix
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA, TruncatedSVD
import os  # for Macbook
os.environ['KMP_DUPLICATE_LIB_OK']='True'

### Load Data

In [2]:
print('Started Loading data...')
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
print('Done!')

Started Loading data...
Done!


### Rename the columns and extract the labels

In [3]:
print(train.columns.shape)
target = train['TARGET']
train.drop(['TARGET'], axis = 1, inplace = True)
train.columns.shape

(371,)


(370,)

### Remove Duplicated Features, including zero columns

In [4]:
print('Removing Duplicated Columns....')
remove = []
c = train.columns
for i in range(len(c)-1):
    v = train[c[i]].values
    for j in range(i+1, len(c)):
        if np.array_equal(v, train[c[j]].values):
            remove.append(c[j])
print(np.shape(remove))
train.drop(remove, axis=1, inplace=True)
test.drop(remove, axis=1, inplace=True)
print('Done!')

Removing Duplicated Columns....
(590,)
Done!


### Remove constant features

In [5]:
print('Removing constant columns....')
remove = []
for col in train.columns:
    if len(train[col].unique()) == 0:
        remove.append(col)
print(np.shape(remove))
train.drop(remove, axis=1, inplace=True)
test.drop(remove, axis=1, inplace=True)
print('Done!')

Removing constant columns....
(0,)
Done!


### Add a feature that counts number of assets

In [6]:
original_features = train.columns[1:] #为什么排除了最后一个？
print(train.shape)
train['SumZeros'] = (train[original_features] == 0).sum(axis=1)
test['SumZeros'] = (test[original_features] == 0).sum(axis=1)
train.shape

(76020, 308)


(76020, 309)

In [9]:
train.columns

Index(['ID', 'var3', 'var15', 'imp_ent_var16_ult1', 'imp_op_var39_comer_ult1',
       'imp_op_var39_comer_ult3', 'imp_op_var40_comer_ult1',
       'imp_op_var40_comer_ult3', 'imp_op_var40_efect_ult1',
       'imp_op_var40_efect_ult3',
       ...
       'saldo_medio_var33_hace2', 'saldo_medio_var33_hace3',
       'saldo_medio_var33_ult1', 'saldo_medio_var33_ult3',
       'saldo_medio_var44_hace2', 'saldo_medio_var44_hace3',
       'saldo_medio_var44_ult1', 'saldo_medio_var44_ult3', 'var38',
       'SumZeros'],
      dtype='object', length=309)

### Generate PCA features

In [7]:
df = pd.concat([train, test], axis = 0)
df.shape
n_train = train.shape[0] # for split the df back into train, test later
n_test = test.shape[0]

def generate_PCA_feature(train, test, original_feature, n_components = 4):
    """
    we fit a PCA decomposition model to training set of the shape
    : n_train_samples * n_features
    and transform on both training set and testing test. 
    """
    pca = PCA(n_components = n_components)
    # we need to normalize the data before fitting
    train_projected = pca.fit_transform(normalize(train[original_features], axis=0))
    test_projected = pca.transform(normalize(test[original_features], axis=0))
    for i in range(1, n_components + 1):
        name = 'PCA{:02d}'.format(i)
        train[name] = train_projected[:, i - 1]
        test[name] = test_projected[:, i - 1]
    return train, test
print('Generating PCA features')
train, test = generate_PCA_feature(train, test, original_features, n_components = 4)
print('Done!') 

Generating PCA features
Done!


In [8]:
train.columns #  more features added, as expected

Index(['ID', 'var3', 'var15', 'imp_ent_var16_ult1', 'imp_op_var39_comer_ult1',
       'imp_op_var39_comer_ult3', 'imp_op_var40_comer_ult1',
       'imp_op_var40_comer_ult3', 'imp_op_var40_efect_ult1',
       'imp_op_var40_efect_ult3',
       ...
       'saldo_medio_var44_hace2', 'saldo_medio_var44_hace3',
       'saldo_medio_var44_ult1', 'saldo_medio_var44_ult3', 'var38', 'SumZeros',
       'PCA01', 'PCA02', 'PCA03', 'PCA04'],
      dtype='object', length=313)

### Truncated SVD features for nonlinear clustering

In [58]:
# we perform cross validation to set the n_components
def generate_SVD_feature(train, test, original_features, n_components = 5):
    """
    we fit a PCA decomposition model to training set of the shape
    : n_train_samples * n_features
    and transform on both training set and testing test. 
    """
    svd = TruncatedSVD(n_components = n_components)
    # we need to normalize the data before fitting
    train_projected = svd.fit_transform(train[original_features])
    test_projected = svd.transform(test[original_features])
    for i in range(1, n_components + 1):
        name = 'SVD{:02d}'.format(i)
        train[name] = train_projected[:, i - 1]
        test[name] = test_projected[:, i - 1]
    return train, test
print('Generating SVD features')
train, test = generate_SVD_feature(train, test, original_features, n_components = 5)
print('Done!')

Generating SVD features
Done!


In [59]:
train.shape # 5 more features added, as expected

(76020, 316)

## XGBoost

### 10-fold cross validation splits

In [22]:
split = 10
skf = StratifiedKFold(n_splits = split)
skf.get_n_splits(train, target)

10

### Set parameters

In [10]:
num_boost_round = 10
params = {}
params["objective"] = "binary:logistic"
params["eta"] = 0.03 # 如同学习率
params["subsample"] = 1  # 随机采样训练样本 训练实例的子采样比
params["colsample_bytree"] = 0.7 # 生成树时进行的列采样
params["silent"] = 0 #设置成1则没有运行信息输出，最好是设置为0.是否在运行升级时打印消息。
params["max_depth"] = 5 # 构建树的深度，越大越容易过拟合
params["min_child_weight"] = 1
# 这个参数默认是 1，是每个叶子里面 h 的和至少是多少，对正负样本不均衡时的 0-1 分类而言
#，假设 h 在 0.01 附近，min_child_weight 为 1 意味着叶子节点中最少需要包含 100 个样本。
params["eval_metric"] = "auc"

In [11]:
train['target'] = target # add back the label
features = train.columns[1:-1] # exclude ID

In [13]:
features

Index(['var3', 'var15', 'imp_ent_var16_ult1', 'imp_op_var39_comer_ult1',
       'imp_op_var39_comer_ult3', 'imp_op_var40_comer_ult1',
       'imp_op_var40_comer_ult3', 'imp_op_var40_efect_ult1',
       'imp_op_var40_efect_ult3', 'imp_op_var40_ult1',
       ...
       'saldo_medio_var44_hace2', 'saldo_medio_var44_hace3',
       'saldo_medio_var44_ult1', 'saldo_medio_var44_ult3', 'var38', 'SumZeros',
       'PCA01', 'PCA02', 'PCA03', 'PCA04'],
      dtype='object', length=312)

In [12]:
train.columns

Index(['ID', 'var3', 'var15', 'imp_ent_var16_ult1', 'imp_op_var39_comer_ult1',
       'imp_op_var39_comer_ult3', 'imp_op_var40_comer_ult1',
       'imp_op_var40_comer_ult3', 'imp_op_var40_efect_ult1',
       'imp_op_var40_efect_ult3',
       ...
       'saldo_medio_var44_hace3', 'saldo_medio_var44_ult1',
       'saldo_medio_var44_ult3', 'var38', 'SumZeros', 'PCA01', 'PCA02',
       'PCA03', 'PCA04', 'target'],
      dtype='object', length=314)

In [18]:
train_preds = None
test_preds = None
xgb_classifiers = [] # List[[clf, score]]
index = 0
for train_index, test_index in skf.split(train, target):
    print('Fold:', index + 1)
    index = index + 1
    X_train, X_test = train.iloc[train_index], train.iloc[test_index] # split training set into'train', 'cross-validation' sets
#    y_train, y_test = target.iloc[train_index], target.iloc[test_index]

    # for xgb classifier, we transform them into DMatrix format
    D_train = xgb.DMatrix(
                    csr_matrix(X_train[features]),
                    X_train.target.values,
                    silent = True)
#    print(D_train.num_col())
#    print(D_train.num_row())
    
    D_test = xgb.DMatrix(
                    csr_matrix(X_test[features]),
                    X_test.target.values,
                    silent = True)
    watchlist = [(D_test, 'eval'), (D_train, 'train')]
#    print(D_test.num_col())
#    print(D_test.num_row())
  
    # fit the classfier now
    clf = xgb.train(params, D_train, num_boost_round,
                    evals = watchlist)
    
    test_prediction = clf.predict(D_test)
    print('Blind Log Loss:', log_loss(X_test.target.values, test_prediction))
    score = roc_auc_score(X_test.target.values, test_prediction)
    print('Blind ROC:', score)
    
    
    del X_train, X_test, D_train, D_test
    gc.collect()
    print('finished a training model')
    print('fitting on full data set now...')
    
    D_full_train = \
        xgb.DMatrix(csr_matrix(train[features]),
                    train.values,
                    silent = True)
    D_full_test = \
        xgb.DMatrix(csr_matrix(test[features]),
                    silent = True)
    if(train_preds is None):
        train_preds = clf.predict(D_full_train)
        test_preds = clf.predict(D_full_test)
    else:
        train_preds *= clf.predict(D_full_train) # we manually perform an average of the results
        test_preds *= clf.predict(D_full_test)
    xgb_classifiers.append([clf, 'with auc score: {:10f}'.format(score)])
    del D_full_train, D_full_test, clf
    gc.collect()


print('Done!')

Fold: 1
[0]	eval-auc:0.806581	train-auc:0.818774
[1]	eval-auc:0.812097	train-auc:0.823918
[2]	eval-auc:0.811927	train-auc:0.825647
[3]	eval-auc:0.818346	train-auc:0.831688
[4]	eval-auc:0.820401	train-auc:0.834072
[5]	eval-auc:0.818017	train-auc:0.835681
[6]	eval-auc:0.816894	train-auc:0.835054
[7]	eval-auc:0.819524	train-auc:0.83583
[8]	eval-auc:0.822144	train-auc:0.837802
[9]	eval-auc:0.822591	train-auc:0.838328
Blind Log Loss: 0.4937098970518886
Blind ROC: 0.8225910436407082
finished a training model
fitting on full data set now...


  "because it will generate extra copies and increase memory consumption")


Fold: 2
[0]	eval-auc:0.803856	train-auc:0.820801
[1]	eval-auc:0.806752	train-auc:0.824244
[2]	eval-auc:0.805828	train-auc:0.826292
[3]	eval-auc:0.809933	train-auc:0.830363
[4]	eval-auc:0.810826	train-auc:0.833659
[5]	eval-auc:0.81062	train-auc:0.834969
[6]	eval-auc:0.81135	train-auc:0.835214
[7]	eval-auc:0.810702	train-auc:0.834916
[8]	eval-auc:0.810871	train-auc:0.835285
[9]	eval-auc:0.81134	train-auc:0.837399
Blind Log Loss: 0.49409158294034383
Blind ROC: 0.8113396320673079
finished a training model
fitting on full data set now...


  "because it will generate extra copies and increase memory consumption")


Fold: 3
[0]	eval-auc:0.779226	train-auc:0.8215
[1]	eval-auc:0.781528	train-auc:0.825961
[2]	eval-auc:0.783613	train-auc:0.829467
[3]	eval-auc:0.789654	train-auc:0.831055
[4]	eval-auc:0.795079	train-auc:0.832715
[5]	eval-auc:0.796618	train-auc:0.833589
[6]	eval-auc:0.797567	train-auc:0.834818
[7]	eval-auc:0.798396	train-auc:0.835552
[8]	eval-auc:0.79646	train-auc:0.837819
[9]	eval-auc:0.796137	train-auc:0.839325
Blind Log Loss: 0.49392594726767486
Blind ROC: 0.7961370148630256
finished a training model
fitting on full data set now...


  "because it will generate extra copies and increase memory consumption")


Fold: 4
[0]	eval-auc:0.801286	train-auc:0.821889
[1]	eval-auc:0.812263	train-auc:0.825834
[2]	eval-auc:0.816714	train-auc:0.830877
[3]	eval-auc:0.817473	train-auc:0.832079
[4]	eval-auc:0.821068	train-auc:0.834969
[5]	eval-auc:0.823229	train-auc:0.83667
[6]	eval-auc:0.824621	train-auc:0.836901
[7]	eval-auc:0.825458	train-auc:0.837343
[8]	eval-auc:0.824894	train-auc:0.838034
[9]	eval-auc:0.82467	train-auc:0.838986
Blind Log Loss: 0.49359754970656416
Blind ROC: 0.8246701744311183
finished a training model
fitting on full data set now...


  "because it will generate extra copies and increase memory consumption")


Fold: 5
[0]	eval-auc:0.817661	train-auc:0.816289
[1]	eval-auc:0.818437	train-auc:0.824787
[2]	eval-auc:0.824175	train-auc:0.827308
[3]	eval-auc:0.823242	train-auc:0.828236
[4]	eval-auc:0.82777	train-auc:0.830826
[5]	eval-auc:0.836619	train-auc:0.833031
[6]	eval-auc:0.835362	train-auc:0.835501
[7]	eval-auc:0.834731	train-auc:0.836303
[8]	eval-auc:0.836016	train-auc:0.836942
[9]	eval-auc:0.838184	train-auc:0.838867
Blind Log Loss: 0.4937996398623257
Blind ROC: 0.8381844565960791
finished a training model
fitting on full data set now...


  "because it will generate extra copies and increase memory consumption")


Fold: 6
[0]	eval-auc:0.798024	train-auc:0.817064
[1]	eval-auc:0.81098	train-auc:0.82189
[2]	eval-auc:0.810497	train-auc:0.824778
[3]	eval-auc:0.816763	train-auc:0.830633
[4]	eval-auc:0.819677	train-auc:0.835548
[5]	eval-auc:0.822073	train-auc:0.837357
[6]	eval-auc:0.820862	train-auc:0.836281
[7]	eval-auc:0.820939	train-auc:0.836889
[8]	eval-auc:0.822384	train-auc:0.838222
[9]	eval-auc:0.821242	train-auc:0.839935
Blind Log Loss: 0.49381750898513627
Blind ROC: 0.8212421181097024
finished a training model
fitting on full data set now...


  "because it will generate extra copies and increase memory consumption")


Fold: 7
[0]	eval-auc:0.809979	train-auc:0.818891
[1]	eval-auc:0.812038	train-auc:0.820903
[2]	eval-auc:0.820035	train-auc:0.827355
[3]	eval-auc:0.818185	train-auc:0.826191
[4]	eval-auc:0.829449	train-auc:0.832155
[5]	eval-auc:0.832453	train-auc:0.833507
[6]	eval-auc:0.830299	train-auc:0.832733
[7]	eval-auc:0.829449	train-auc:0.833273
[8]	eval-auc:0.830948	train-auc:0.834106
[9]	eval-auc:0.832619	train-auc:0.835899
Blind Log Loss: 0.4936103325781149
Blind ROC: 0.8326188420918993
finished a training model
fitting on full data set now...


  "because it will generate extra copies and increase memory consumption")


Fold: 8
[0]	eval-auc:0.835477	train-auc:0.818797
[1]	eval-auc:0.841833	train-auc:0.82628
[2]	eval-auc:0.843104	train-auc:0.829462
[3]	eval-auc:0.843898	train-auc:0.831376
[4]	eval-auc:0.845291	train-auc:0.832395
[5]	eval-auc:0.849459	train-auc:0.833487
[6]	eval-auc:0.849748	train-auc:0.83464
[7]	eval-auc:0.848143	train-auc:0.834495
[8]	eval-auc:0.848919	train-auc:0.835191
[9]	eval-auc:0.848375	train-auc:0.836302
Blind Log Loss: 0.4933071628632404
Blind ROC: 0.8483751144998568
finished a training model
fitting on full data set now...


  "because it will generate extra copies and increase memory consumption")


Fold: 9
[0]	eval-auc:0.815119	train-auc:0.818708
[1]	eval-auc:0.816439	train-auc:0.821709
[2]	eval-auc:0.819236	train-auc:0.82868
[3]	eval-auc:0.820221	train-auc:0.828947
[4]	eval-auc:0.820632	train-auc:0.830498
[5]	eval-auc:0.821824	train-auc:0.832299
[6]	eval-auc:0.824613	train-auc:0.833981
[7]	eval-auc:0.825032	train-auc:0.834215
[8]	eval-auc:0.825546	train-auc:0.834775
[9]	eval-auc:0.827695	train-auc:0.836077
Blind Log Loss: 0.4935291192865955
Blind ROC: 0.8276950646030224
finished a training model
fitting on full data set now...


  "because it will generate extra copies and increase memory consumption")


Fold: 10
[0]	eval-auc:0.791223	train-auc:0.824929
[1]	eval-auc:0.797652	train-auc:0.831144
[2]	eval-auc:0.801832	train-auc:0.832629
[3]	eval-auc:0.803331	train-auc:0.832796
[4]	eval-auc:0.805368	train-auc:0.835988
[5]	eval-auc:0.806129	train-auc:0.835941
[6]	eval-auc:0.80654	train-auc:0.835957
[7]	eval-auc:0.808174	train-auc:0.836883
[8]	eval-auc:0.810074	train-auc:0.837826
[9]	eval-auc:0.812566	train-auc:0.839174
Blind Log Loss: 0.4941798559051958
Blind ROC: 0.8125658585581884
finished a training model
fitting on full data set now...


  "because it will generate extra copies and increase memory consumption")


Done!


### save the xgb_classifiers lists for later ensemble

In [19]:
pickle.dump(xgb_classifiers, open('xgboost_classifier_param1.dat', 'wb'))

In [20]:
data = pickle.load(open('xgboost_classifier_param1.dat', 'rb'))

### set another set of parameters, use a randomized search

In [25]:
split = 5
random_state = 111
skf = StratifiedKFold(target,
                      n_folds=split,
                      shuffle=False,
                      random_state= random_state)
num_rounds = 350 + np.random.randint(low = -50, high = 50)
params = {}
params["objective"] = "binary:logistic"
params["eta"] = 0.03 + np.random.normal(loc = 0.0, scale = 0.01)
params["subsample"] = 0.8
params["colsample_bytree"] = 0.7
params["silent"] = 1
params["max_depth"] = 5 + np.random.randint(low = -1, high = 2)
params["min_child_weight"] = 1
params["eval_metric"] = "auc"

In [46]:
train_preds = None
test_preds = None
xgb_classifiers2 = []
for index, (train_index, test_index) in enumerate(skf):
    print('Fold:', index)
    X_train = train.iloc[train_index] # split training set into'train', 'cross-validation' sets
    X_test = train.iloc[test_index]
    
    # for xgb classifier, we transform them into DMatrix format
    D_train = xgb.DMatrix(
                    csr_matrix(X_train[features]),
                    X_train.target.values,
                    silent=True)
    
    D_test = xgb.DMatrix(
                    csr_matrix(X_test[features]),
                    X_test.target.values,
                    silent=True)
    watchlist = [(D_test, 'eval'), (D_train, 'train')]
    
    # fit the classfier now
    clf = xgb.train(params, D_train, num_rounds,
                    evals = watchlist, early_stopping_rounds=50,
                    verbose_eval=False)

    test_prediction = clf.predict(D_test)
    print('Blind Log Loss:', log_loss(X_test.target.values,
                                      test_prediction))
    score = roc_auc_score(X_test.target.values,
                                      test_prediction)
    print('Blind ROC:', score)
    index = index + 1
    
    del X_train, X_test, D_train, D_test
    gc.collect()
    print 'finished a training model'
    print 'fitting on full data set now...'
    
    D_full_train = \
        xgb.DMatrix(csr_matrix(train[features]),
                    train.target.values,
                    silent=True)
    D_full_test = \
        xgb.DMatrix(csr_matrix(test[features]),
                    silent=True)
    if(train_preds is None):
        train_preds = clf.predict(D_full_train)
        test_preds = clf.predict(D_full_test)
    else:
        train_preds *= clf.predict(D_full_train) # we manually perform an average of the results
        test_preds *= clf.predict(D_full_test)
    xgb_classifiers2.append([clf, 'with auc score: {:10f}'.format(score)])
    del D_full_train, D_full_test, clf
    gc.collect()
print 'Done!'

('Fold:', 0)
('Blind Log Loss:', 0.13515509804987907)
('Blind ROC:', 0.83580701685336112)
finished a training model
fitting on full data set now...
('Fold:', 1)
('Blind Log Loss:', 0.13680937251164763)
('Blind ROC:', 0.82864225095512389)
finished a training model
fitting on full data set now...
('Fold:', 2)
('Blind Log Loss:', 0.13302371896058762)
('Blind ROC:', 0.84442239514816397)
finished a training model
fitting on full data set now...
('Fold:', 3)
('Blind Log Loss:', 0.12968091185474287)
('Blind ROC:', 0.85590906677247269)
finished a training model
fitting on full data set now...
('Fold:', 4)
('Blind Log Loss:', 0.13412376736040632)
('Blind ROC:', 0.83843641868857111)
finished a training model
fitting on full data set now...
Done!


In [48]:
print params

{'colsample_bytree': 0.7, 'silent': 1, 'eval_metric': 'auc', 'min_child_weight': 1, 'subsample': 0.8, 'eta': 0.019899065033745854, 'objective': 'binary:logistic', 'max_depth': 5}


In [23]:
train_preds = np.power(train_preds, 1./split)
test_preds = np.power(test_preds, 1./split)
print('Average Log Loss:', log_loss(train.target.values, train_preds))
print('Average ROC:', roc_auc_score(train.target.values, train_preds))

Average Log Loss: 0.49344423582242747
Average ROC: 0.8404411930500204


In [24]:
submission = pd.DataFrame({"ID": train.ID,
                           "TARGET": train.target,
                           "PREDICTION": train_preds})

submission.to_csv("simplexgbtrain.csv", index=False)
submission = pd.DataFrame({"ID": test.ID, "TARGET": test_preds})
submission.to_csv("simplexgbtest.csv", index=False)
print('Finish')

Finish
