In [2]:
import numpy as np 
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from xgboost.sklearn import XGBClassifier
from PreprocessingScripts import *

# import data from files
act_train_data = pd.read_csv("c:/ml/redhat/input/act_train.csv",dtype={'people_id': np.str, 'activity_id': np.str, 'outcome': np.int8}, parse_dates=['date'])
act_test_data  = pd.read_csv("c:/ml/redhat/input/act_test.csv", dtype={'people_id': np.str, 'activity_id': np.str}, parse_dates=['date'])
people_data    = pd.read_csv("c:/ml/redhat/input/people.csv", dtype={'people_id': np.str, 'activity_id': np.str, 'char_38': np.int32}, parse_dates=['date'])

# drop characteristic 10 from activity files because there is no data
act_train_data=act_train_data.drop('char_10',axis=1)
act_test_data=act_test_data.drop('char_10',axis=1)

print("Train data shape: " + format(act_train_data.shape))
print("Test data shape: " + format(act_test_data.shape))
print("People data shape: " + format(people_data.shape))

# massage data so that it's easier to work with and analyze
act_train_data  = act_data_treatment(act_train_data)
act_test_data   = act_data_treatment(act_test_data)
people_data = act_data_treatment(people_data)

# merge activity file with people file using people_id key
train = act_train_data.merge(people_data, on='people_id', how='left', left_index=True)
test  = act_test_data.merge(people_data, on='people_id', how='left', left_index=True)

del act_train_data
del act_test_data
del people_data

# sort data by people_id, [1] has one value of 1, means True for people_id
train=train.sort_values(['people_id'], ascending=[1])
test=test.sort_values(['people_id'], ascending=[1])

train_columns = train.columns.values
test_columns = test.columns.values
features = list(set(train_columns) & set(test_columns))

train.fillna('NA', inplace=True)
test.fillna('NA', inplace=True)

y = train.outcome
train=train.drop('outcome',axis=1)

# concat train and test on row. The resulting axis, which is row, will be labeled 0, ..., n - 1.
# combine the two datasets to perform my own train test split
whole=pd.concat([train,test],ignore_index=True)
categorical=['group_1','activity_category','char_1_x','char_2_x','char_3_x','char_4_x','char_5_x','char_6_x','char_7_x','char_8_x','char_9_x','char_2_y','char_3_y','char_4_y','char_5_y','char_6_y','char_7_y','char_8_y','char_9_y']
for category in categorical:
    whole=reduce_dimen(whole,category,9999999)
    
X=whole[:len(train)]
X_test=whole[len(train):]

del train
del whole
    
X=X.sort_values(['people_id'], ascending=[1])

# drop non-feature columns
X = X[features].drop(['people_id', 'activity_id'], axis = 1)
X_test = X_test[features].drop(['people_id', 'activity_id'], axis = 1)

categorical=['group_1','activity_category','char_1_x','char_2_x','char_3_x','char_4_x','char_5_x','char_6_x','char_7_x','char_8_x','char_9_x','char_2_y','char_3_y','char_4_y','char_5_y','char_6_y','char_7_y','char_8_y','char_9_y']
not_categorical=[]
for category in X.columns:
    if category not in categorical:
        not_categorical.append(category)
        


# split X,y into training and validation set
from sklearn.cross_validation import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state=42)

# convert categorical columns to numerical
enc = OneHotEncoder(handle_unknown='ignore')
enc=enc.fit(pd.concat([X[categorical], X_test[categorical]]))
X_train_cat_sparse=enc.transform(X_train[categorical])
X_val_cat_sparse=enc.transform(X_val[categorical])
X_test_cat_sparse=enc.transform(X_test[categorical])

parameters = {'nthread':[4], 
              'objective':['binary:logistic'], 
              'max_depth': [10,11,12],
              'min_child_weight': [0],
              'silent': [1],
              'subsample': [.8],
              'colsample_bytree': [.8]
             }

gbm = XGBClassifier()

clf = GridSearchCV(estimator=gbm, 
                   param_grid=parameters, 
                   n_jobs=5, 
                   cv=StratifiedKFold(y_train, n_folds=5, shuffle=True), 
                   scoring='roc_auc',
                   verbose=2, 
                   refit=True
                  )

clf.fit(X_train_cat_sparse,y_train)

#trust your CV!
best_parameters, score, _ = max(clf.grid_scores_, key=lambda x: x[1])
print('Raw AUC score:', score)
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

test_probs = clf.predict_proba(X_train_cat_sparse)[:,1]
print test_probs

Train data shape: (2197291, 14)
Test data shape: (498687, 13)
People data shape: (189118, 41)
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=5)]: Done  15 out of  15 | elapsed: 40.6min finished


('Raw AUC score:', 0.94993942464905612)
colsample_bytree: 0.8
max_depth: 11
min_child_weight: 0
nthread: 4
objective: 'binary:logistic'
silent: 1
subsample: 0.8
[  8.97492349e-01   7.67945886e-01   5.38291574e-01 ...,   3.64481471e-04
   9.15858865e-01   3.64481471e-04]
