In [2]:
import numpy as np 
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score
from PreprocessingScripts import *

# import data from files
act_train_data = pd.read_csv(
    "c:/ml/redhat/input/act_train.csv",
    dtype = {
        'people_id': np.str, 
        'activity_id': np.str, 
        'outcome': np.int8
    }, 
    parse_dates=['date']
)

people_data = pd.read_csv(
    "c:/ml/redhat/input/people.csv",
    dtype = {
        'people_id': np.str, 
        'activity_id': np.str, 
        'char_38': np.int32
    }, 
    parse_dates=['date']
)

# massage data so that it's easier to work with and analyze
act_train_data  = act_data_treatment(act_train_data)
people_data = act_data_treatment(people_data)

# merge activity file with people file using people_id key
train = act_train_data.merge(
    people_data, 
    on='people_id', 
    how='left', 
    left_index=True
)

del act_train_data
del people_data

# sort data by people_id, [1] has one value of 1, means True for people_id
train=train.sort_values(['people_id'], ascending=[1])

# fill blanks with NA
train.fillna('NA', inplace=True)

# remove outcome from feature set
y = train.outcome
train=train.drop('outcome',axis=1)
train_columns = train.columns.values
features = list(set(train_columns))

# categorical columns in merged dataset
categorical=['group_1',
             'activity_category',
             'char_1_x',
             'char_2_x',
             'char_3_x',
             'char_4_x',
             'char_5_x',
             'char_6_x',
             'char_7_x',
             'char_8_x',
             'char_9_x',
             'char_10_x',
             'char_2_y',
             'char_3_y',
             'char_4_y',
             'char_5_y',
             'char_6_y',
             'char_7_y',
             'char_8_y',
             'char_9_y'
            ]

# reduce dimensionality of categorical features
for category in categorical:
    train=reduce_dimen(train,category,9999999)
    
# change variable name to X for convenience
X=train
del train
    
X=X.sort_values(['people_id'], ascending=[1])

# drop non-feature columns
X = X[features].drop(['people_id', 'activity_id'], axis = 1)

not_categorical=[]

for category in X.columns:
    if category not in categorical:
        not_categorical.append(category)
        
# split X,y into training and validation set
from sklearn.cross_validation import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state=42)

# convert categorical columns to numerical
enc = OneHotEncoder(handle_unknown='ignore')
enc=enc.fit(X[categorical])
X_train_cat_sparse=enc.transform(X_train[categorical])
X_val_cat_sparse=enc.transform(X_val[categorical])

# stack sparse matrices
from scipy.sparse import hstack
X_train_sparse=hstack((X_train[not_categorical], X_train_cat_sparse))
X_val_sparse=hstack((X_val[not_categorical], X_val_cat_sparse))

dTrain = xgb.DMatrix(X_train_sparse,label=y_train)
dValidate = xgb.DMatrix(X_val_sparse,label=y_val)

# set classifier parameters
param = {'max_depth':11, 'eta':0.02, 'silent':1, 'objective':'binary:logistic' }
param['nthread'] = 4
param['eval_metric'] = 'auc'
param['subsample'] = 0.8
param['colsample_bytree']= 0.8

# default booster 'gbtree' doesn't perform as well as 'gblinear'
param['booster'] = "gblinear"

# train model
watchlist = [(dValidate,'eval'), (dTrain,'train')]
num_round = 400
early_stopping_rounds=5

evals_result = {}

bst = xgb.train(
    param, 
    dTrain, 
    num_round, 
    watchlist,
    early_stopping_rounds=early_stopping_rounds,
    evals_result=evals_result
)

# validate
yPrediction = bst.predict(dValidate)
print(roc_auc_score(y_val, yPrediction))

[0]	eval-auc:0.889441	train-auc:0.889292
Multiple eval metrics have been passed: 'train-auc' will be used for early stopping.

Will train until train-auc hasn't improved in 5 rounds.
[1]	eval-auc:0.898272	train-auc:0.898143
[2]	eval-auc:0.907268	train-auc:0.90717
[3]	eval-auc:0.915886	train-auc:0.915834
[4]	eval-auc:0.923916	train-auc:0.92391
[5]	eval-auc:0.931225	train-auc:0.931244
[6]	eval-auc:0.938026	train-auc:0.938076
[7]	eval-auc:0.94443	train-auc:0.944521
[8]	eval-auc:0.950463	train-auc:0.950586
[9]	eval-auc:0.956088	train-auc:0.956243
[10]	eval-auc:0.961248	train-auc:0.961439
[11]	eval-auc:0.965877	train-auc:0.966104
[12]	eval-auc:0.969952	train-auc:0.970216
[13]	eval-auc:0.973482	train-auc:0.973778
[14]	eval-auc:0.976494	train-auc:0.976822
[15]	eval-auc:0.979043	train-auc:0.979407
[16]	eval-auc:0.981181	train-auc:0.981576
[17]	eval-auc:0.982969	train-auc:0.983394
[18]	eval-auc:0.984461	train-auc:0.984913
[19]	eval-auc:0.985709	train-auc:0.986184
[20]	eval-auc:0.986756	train-au

In [5]:
(0.996772831138 - 0.966989848292) * 100 / 0.996772831138

2.987940874351204