In [2]:
import numpy as np 
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score
from PreprocessingScripts import *

# import data from files
act_train_data = pd.read_csv(
    "c:/ml/redhat/input/act_train.csv",
    dtype = {
        'people_id': np.str, 
        'activity_id': np.str, 
        'outcome': np.int8
    }, 
    parse_dates=['date']
)

people_data = pd.read_csv(
    "c:/ml/redhat/input/people.csv",
    dtype = {
        'people_id': np.str, 
        'activity_id': np.str, 
        'char_38': np.int32
    }, 
    parse_dates=['date']
)

# massage data so that it's easier to work with and analyze
act_train_data  = act_data_treatment(act_train_data)
people_data = act_data_treatment(people_data)

# merge activity file with people file using people_id key
train = act_train_data.merge(
    people_data, 
    on='people_id', 
    how='left', 
    left_index=True
)

del act_train_data
del people_data

# sort data by people_id, [1] has one value of 1, means True for people_id
train=train.sort_values(['people_id'], ascending=[1])

# fill blanks with NA
train.fillna('NA', inplace=True)

# remove outcome from feature set
y = train.outcome
train=train.drop('outcome',axis=1)
train_columns = train.columns.values
features = list(set(train_columns))

# categorical columns in merged dataset
categorical=['group_1',
             'activity_category',
             'char_1_x',
             'char_2_x',
             'char_3_x',
             'char_4_x',
             'char_5_x',
             'char_6_x',
             'char_7_x',
             'char_8_x',
             'char_9_x',
             'char_10_x',
             'char_2_y',
             'char_3_y',
             'char_4_y',
             'char_5_y',
             'char_6_y',
             'char_7_y',
             'char_8_y',
             'char_9_y'
            ]

# reduce dimensionality of categorical features
for category in categorical:
    train=reduce_dimen(train,category,9999999)
    
# change variable name to X for convenience
X=train
del train
    
X=X.sort_values(['people_id'], ascending=[1])

# drop non-feature columns
X = X[features].drop(['people_id', 'activity_id'], axis = 1)

categorical=['group_1',
             'activity_category',
             'char_1_x',
             'char_2_x',
             'char_3_x',
             'char_4_x',
             'char_5_x',
             'char_6_x',
             'char_7_x',
             'char_8_x',
             'char_9_x',
             'char_10_x',
             'char_2_y',
             'char_3_y',
             'char_4_y',
             'char_5_y',
             'char_6_y',
             'char_7_y',
             'char_8_y',
             'char_9_y'
            ]

not_categorical=[]

for category in X.columns:
    if category not in categorical:
        not_categorical.append(category)
        
# split X,y into training and validation set
from sklearn.cross_validation import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state=42)

# convert categorical columns to numerical
enc = OneHotEncoder(handle_unknown='ignore')
enc=enc.fit(X[categorical])
X_train_cat_sparse=enc.transform(X_train[categorical])
X_val_cat_sparse=enc.transform(X_val[categorical])

# stack sparse matrices
from scipy.sparse import hstack
X_train_sparse=hstack((X_train[not_categorical], X_train_cat_sparse))
X_val_sparse=hstack((X_val[not_categorical], X_val_cat_sparse))

dTrain = xgb.DMatrix(X_train_sparse,label=y_train)
dValidate = xgb.DMatrix(X_val_sparse,label=y_val)

# set classifier parameters
param = {'max_depth':11, 'eta':0.02, 'silent':1, 'objective':'binary:logistic' }
param['nthread'] = 4
param['eval_metric'] = 'auc'
param['subsample'] = 0.8
param['colsample_bytree']= 0.8
#param['booster'] = "gblinear"

# train model
watchlist  = [(dTrain,'train')]
num_round = 300
early_stopping_rounds=5

bst = xgb.train(
    param, 
    dTrain, 
    num_round, 
    watchlist,
    early_stopping_rounds=early_stopping_rounds
)

# validate
yPrediction = bst.predict(dValidate)
print(roc_auc_score(y_val, yPrediction))

[0]	train-auc:0.921986
Will train until train-auc hasn't improved in 5 rounds.
[1]	train-auc:0.929154
[2]	train-auc:0.930506
[3]	train-auc:0.931017
[4]	train-auc:0.930856
[5]	train-auc:0.930924
[6]	train-auc:0.931907
[7]	train-auc:0.932081
[8]	train-auc:0.932274
[9]	train-auc:0.932119
[10]	train-auc:0.932386
[11]	train-auc:0.932542
[12]	train-auc:0.932603
[13]	train-auc:0.933203
[14]	train-auc:0.933275
[15]	train-auc:0.933341
[16]	train-auc:0.933457
[17]	train-auc:0.933644
[18]	train-auc:0.933764
[19]	train-auc:0.933925
[20]	train-auc:0.934184
[21]	train-auc:0.934474
[22]	train-auc:0.93484
[23]	train-auc:0.935176
[24]	train-auc:0.935181
[25]	train-auc:0.935305
[26]	train-auc:0.935313
[27]	train-auc:0.935343
[28]	train-auc:0.935576
[29]	train-auc:0.9358
[30]	train-auc:0.935869
[31]	train-auc:0.935967
[32]	train-auc:0.93622
[33]	train-auc:0.936281
[34]	train-auc:0.936265
[35]	train-auc:0.936305
[36]	train-auc:0.936498
[37]	train-auc:0.936729
[38]	train-auc:0.93705
[39]	train-auc:0.937287

In [3]:
from matplotlib import pylab as plt

def get_xgb_imp(xgb, feat_names):
    from numpy import array
    imp_vals = xgb.get_fscore()
    imp_dict = {
        feat_names[i]:float(imp_vals.get('f'  +str(i), 0.)) 
        for i in range(len(feat_names))
    }
    total = array(imp_dict.values()).sum()
    return {k:v/total for k,v in imp_dict.items()}

importance = get_xgb_imp(bst,features)

df = pd.DataFrame(importance.items(), columns=['feature', 'fscore'])
df.sort_values(by='fscore', ascending=True, inplace=True)
df['fscore'] = df['fscore'] / df['fscore'].sum()

plt.figure()

df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(10, 15), grid=True, colormap='summer')
plt.title('XGBoost Feature Importance')
plt.xlabel('relative importance')
plt.gcf().savefig('feature_importance_xgb.png')