### XGBoost

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from time import time



In [2]:
act_train_data = pd.read_csv("./act_train.csv", 
                             dtype={'people_id': np.str, 'activity_id': np.str, 'outcome': np.int8}, 
                             parse_dates=['date'])
act_test_data  = pd.read_csv("./act_test.csv", 
                             dtype={'people_id': np.str, 'activity_id': np.str}, parse_dates=['date'])
people_data    = pd.read_csv("./people.csv", 
                             dtype={'people_id': np.str, 'activity_id': np.str, 'char_38': np.int32}, 
                             parse_dates=['date'])

In [3]:
act_train_data[:3]

Unnamed: 0,people_id,activity_id,date,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,outcome
0,ppl_100,act2_1734928,2023-08-26,type 4,,,,,,,,,,type 76,0
1,ppl_100,act2_2434093,2022-09-27,type 2,,,,,,,,,,type 1,0
2,ppl_100,act2_3404049,2022-09-27,type 2,,,,,,,,,,type 1,0


In [4]:
act_train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2197291 entries, 0 to 2197290
Data columns (total 15 columns):
people_id            object
activity_id          object
date                 datetime64[ns]
activity_category    object
char_1               object
char_2               object
char_3               object
char_4               object
char_5               object
char_6               object
char_7               object
char_8               object
char_9               object
char_10              object
outcome              int8
dtypes: datetime64[ns](1), int8(1), object(13)
memory usage: 236.8+ MB


In [5]:
people_data[:3]

Unnamed: 0,people_id,char_1,group_1,char_2,date,char_3,char_4,char_5,char_6,char_7,...,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38
0,ppl_100,type 2,group 17304,type 2,2021-06-29,type 5,type 5,type 5,type 3,type 11,...,False,True,True,False,False,True,True,True,False,36
1,ppl_100002,type 2,group 8688,type 3,2021-01-06,type 28,type 9,type 5,type 3,type 11,...,False,True,True,True,True,True,True,True,False,76
2,ppl_100003,type 2,group 33592,type 3,2022-06-10,type 4,type 8,type 5,type 2,type 5,...,False,False,True,True,True,True,False,True,True,99


In [6]:
people_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189118 entries, 0 to 189117
Data columns (total 41 columns):
people_id    189118 non-null object
char_1       189118 non-null object
group_1      189118 non-null object
char_2       189118 non-null object
date         189118 non-null datetime64[ns]
char_3       189118 non-null object
char_4       189118 non-null object
char_5       189118 non-null object
char_6       189118 non-null object
char_7       189118 non-null object
char_8       189118 non-null object
char_9       189118 non-null object
char_10      189118 non-null bool
char_11      189118 non-null bool
char_12      189118 non-null bool
char_13      189118 non-null bool
char_14      189118 non-null bool
char_15      189118 non-null bool
char_16      189118 non-null bool
char_17      189118 non-null bool
char_18      189118 non-null bool
char_19      189118 non-null bool
char_20      189118 non-null bool
char_21      189118 non-null bool
char_22      189118 non-null bool
char_23

In [7]:
print("Train data shape: " + format(act_train_data.shape))
print("Test data shape: " + format(act_test_data.shape))
print("People data shape: " + format(people_data.shape))

Train data shape: (2197291, 15)
Test data shape: (498687, 14)
People data shape: (189118, 41)


In [8]:
def reduce_dim(dataset, column):
    # summarize those showing only once to one category
    for index, dup in dataset[column].duplicated(keep=False).iteritems():
        if dup == False:
            dataset.set_value(index, column, -1)
    # re-index
    new_index = {idx:i for i, idx in enumerate(dataset[column].unique())}
    dataset[column] = dataset[column].apply(lambda x: new_index[x])
    return dataset
    
def act_data_treatment(dsname):
    dataset = dsname
    
    for col in list(dataset.columns):
        if col not in ['people_id', 'activity_id', 'date', 'char_38', 'outcome']:
            if dataset[col].dtype == 'object':
                # regard NA as a category
                dataset[col].fillna('type 0', inplace=True)
                dataset[col] = dataset[col].apply(lambda x: x.split(' ')[1]).astype(np.int32)
            elif dataset[col].dtype == 'bool':
                # change binary feature to type int (0/1)
                dataset[col] = dataset[col].astype(np.int8)
    
    dataset['year'] = dataset['date'].dt.year
    dataset['month'] = dataset['date'].dt.month
    dataset['day'] = dataset['date'].dt.day
    dataset['isweekend'] = (dataset['date'].dt.weekday >= 5).astype('int8')
    dataset = dataset.drop('date', axis=1)
    
    return dataset

In [9]:
act_train_data = act_train_data.drop('char_10', axis=1)
act_test_data = act_test_data.drop('char_10', axis=1)

print("Train data shape: " + format(act_train_data.shape))
print("Test data shape: " + format(act_test_data.shape))
print("People data shape: " + format(people_data.shape))

Train data shape: (2197291, 14)
Test data shape: (498687, 13)
People data shape: (189118, 41)


In [10]:
act_train_data = act_data_treatment(act_train_data)
act_test_data  = act_data_treatment(act_test_data)
people_data    = act_data_treatment(people_data)

In [11]:
train = act_train_data.merge(people_data, on='people_id', how='left')
test  = act_test_data.merge(people_data, on='people_id', how='left')
act_id = act_test_data['activity_id']

del act_train_data, act_test_data, people_data

In [12]:
print("Train data shape: " + format(train.shape))
print("Test data shape: " + format(test.shape))

Train data shape: (2197291, 60)
Test data shape: (498687, 59)


In [13]:
train = train.drop(['people_id', 'activity_id'], axis = 1)
test = test.drop(['people_id', 'activity_id'], axis = 1)

print("Train data shape: " + format(train.shape))
print("Test data shape: " + format(test.shape))

Train data shape: (2197291, 58)
Test data shape: (498687, 57)


In [14]:
{col:len(train[col].unique()) for col in train.columns}

{'activity_category': 7,
 'char_10': 2,
 'char_11': 2,
 'char_12': 2,
 'char_13': 2,
 'char_14': 2,
 'char_15': 2,
 'char_16': 2,
 'char_17': 2,
 'char_18': 2,
 'char_19': 2,
 'char_1_x': 52,
 'char_1_y': 2,
 'char_20': 2,
 'char_21': 2,
 'char_22': 2,
 'char_23': 2,
 'char_24': 2,
 'char_25': 2,
 'char_26': 2,
 'char_27': 2,
 'char_28': 2,
 'char_29': 2,
 'char_2_x': 33,
 'char_2_y': 3,
 'char_30': 2,
 'char_31': 2,
 'char_32': 2,
 'char_33': 2,
 'char_34': 2,
 'char_35': 2,
 'char_36': 2,
 'char_37': 2,
 'char_38': 101,
 'char_3_x': 12,
 'char_3_y': 43,
 'char_4_x': 8,
 'char_4_y': 25,
 'char_5_x': 8,
 'char_5_y': 9,
 'char_6_x': 6,
 'char_6_y': 7,
 'char_7_x': 9,
 'char_7_y': 25,
 'char_8_x': 19,
 'char_8_y': 8,
 'char_9_x': 20,
 'char_9_y': 9,
 'day_x': 31,
 'day_y': 31,
 'group_1': 29899,
 'isweekend_x': 2,
 'isweekend_y': 2,
 'month_x': 12,
 'month_y': 12,
 'outcome': 2,
 'year_x': 2,
 'year_y': 4}

In [15]:
y = train['outcome']
train = train.drop('outcome',axis=1)

In [16]:
whole = pd.concat([train, test], ignore_index=True)
categorical = ['group_1', 'activity_category', 'char_1_x', 'char_2_x', 'char_3_x', 'char_4_x', 'char_5_x', 
               'char_6_x', 'char_7_x', 'char_8_x', 'char_9_x', 'char_2_y', 'char_3_y', 'char_4_y', 'char_5_y',
               'char_6_y', 'char_7_y', 'char_8_y', 'char_9_y']
t0 = time()
for category in categorical:
    whole = reduce_dim(whole, category)
print "Elapsed time %.2f seconds for cleaning categorical features." %(time()-t0)

X = whole[:(len(train))]
X_test = whole[len(train):]
mask = np.random.rand(len(train)) < 0.2
X_val, X = X[mask], X[~mask]
y_val, y = y[mask], y[~mask]

print("Train data shape: " + format(X.shape))
print("Validation data shape: " + format(X_val.shape))
print("Test data shape: " + format(X_test.shape))

Elapsed time 93.65 seconds for cleaning categorical features.
Train data shape: (1757203, 57)
Validation data shape: (440088, 57)
Test data shape: (498687, 57)


In [17]:
del train
del test
del whole

not_categorical = []
for category in X.columns:
    if category not in categorical:
        not_categorical.append(category)

### Linear function based

In [18]:
from sklearn.preprocessing import OneHotEncoder

print "One-hot encoding and transforming to sparse..."
t0 = time()
enc = OneHotEncoder(handle_unknown='ignore')
enc = enc.fit(pd.concat([X[categorical], X_val[categorical], X_test[categorical]]))
X_cat_sparse = enc.transform(X[categorical])
X_val_cat_sparse = enc.transform(X_val[categorical])
X_test_cat_sparse = enc.transform(X_test[categorical])

from scipy.sparse import hstack
X_sparse = hstack((X[not_categorical], X_cat_sparse))
X_val_sparse = hstack((X_val[not_categorical], X_val_cat_sparse))
X_test_sparse = hstack((X_test[not_categorical], X_test_cat_sparse))
print "Elapsed time: %.2f seconds." %(time()-t0)

print("Training data: " + format(X_sparse.shape))
print("Validating data: " + format(X_val_sparse.shape))
print("Test data: " + format(X_test_sparse.shape))
del X, X_val, X_test, X_cat_sparse, X_val_cat_sparse, X_test_cat_sparse

One-hot encoding and transforming to sparse...
Elapsed time: 58.45 seconds.
Training data: (1757203, 31271)
Validating data: (440088, 31271)
Test data: (498687, 31271)


In [19]:
print "XGBoost initialing..."
t0 = time()
dtrain = xgb.DMatrix(X_sparse, label=y)
dval = xgb.DMatrix(X_val_sparse, label=y_val)
dtest = xgb.DMatrix(X_test_sparse)
print "Initialized. Elapsed time: %.2f seconds." %(time()-t0)

XGBoost initialing...
Initialized. Elapsed time: 125.78 seconds.


In [20]:
class AucCall(object):
    def __init__(self, lr_decay):
        self.train_auc = []
        self.valid_auc = []
        self.lr_decay = lr_decay
        self.best_val_auc = 0.0
    def __call__(self, env):
        # Record the evaluation and Save the best model
        tr_auc = dict(env.evaluation_result_list)['train-auc']
        self.train_auc.append(tr_auc)
        val_auc = dict(env.evaluation_result_list)['validation-auc']
        self.valid_auc.append(val_auc)
        if val_auc > self.best_val_auc:
            print "The BEST val_auc until now. Saving model..."
            self.best_val_auc = val_auc
            env.model.save_model('./best_redhat.model')
        new_lr_rate = self.rateDecay(env.iteration)
        env.model.set_param('eta', new_lr_rate)
    def rateDecay(self, iter_round):
        # Set learning rate decay
        return param['eta'] / (1 + self.lr_decay * iter_round)

In [21]:
param = {'eta': 0.3, 'silent': 1, 'objective': 'binary:logistic'}
param['nthread'] = 4
param['eval_metric'] = 'auc'
#param['lambda'] = 0.01  # l2 regularization
#param['alpha'] = 0.1   # l1 regularization
param['booster'] = "gblinear"

watchlist  = [(dval,'validation'), (dtrain,'train')]
num_round = 400
early_stopping_rounds = 15

In [22]:
print "Start training..."
t0 = time()
metricRecords = AucCall(0.)
bst = xgb.train(param, dtrain, num_round, watchlist, 
                early_stopping_rounds=early_stopping_rounds, callbacks=[metricRecords])
print "\nTraining complete!"
print "Elapsed time: %.2f seconds." %(time()-t0)

Start training...
The BEST val_auc until now. Saving model...
[0]	validation-auc:0.974437	train-auc:0.975055
Multiple eval metrics have been passed: 'train-auc' will be used for early stopping.

Will train until train-auc hasn't improved in 15 rounds.
The BEST val_auc until now. Saving model...
[1]	validation-auc:0.990906	train-auc:0.991379
The BEST val_auc until now. Saving model...
[2]	validation-auc:0.994154	train-auc:0.994606
The BEST val_auc until now. Saving model...
[3]	validation-auc:0.99544	train-auc:0.995907
The BEST val_auc until now. Saving model...
[4]	validation-auc:0.996051	train-auc:0.996535
The BEST val_auc until now. Saving model...
[5]	validation-auc:0.99637	train-auc:0.996861
The BEST val_auc until now. Saving model...
[6]	validation-auc:0.996543	train-auc:0.997031
The BEST val_auc until now. Saving model...
[7]	validation-auc:0.996639	train-auc:0.997122
The BEST val_auc until now. Saving model...
[8]	validation-auc:0.996695	train-auc:0.997174
The BEST val_auc until

In [84]:
from IPython.lib.display import FileLink

best = xgb.Booster(param)
best.load_model('./best_redhat.model')
ypred = best.predict(dtest)
output = pd.DataFrame({ 'activity_id': act_id, 'outcome': ypred })
output.head()
output.to_csv('eval_redhat.csv', index=False)
FileLink('eval_redhat.csv')

### Tree-based

In [85]:
class AucCall(object):
    def __init__(self, lr_decay):
        self.train_auc = []
        self.valid_auc = []
        self.lr_decay = lr_decay
        self.best_val_auc = 0.0
    def __call__(self, env):
        # Record the evaluation and Save the best model
        tr_auc = dict(env.evaluation_result_list)['train-auc']
        self.train_auc.append(tr_auc)
        val_auc = dict(env.evaluation_result_list)['validation-auc']
        self.valid_auc.append(val_auc)
        if val_auc > self.best_val_auc:
            print "The BEST val_auc until now. Saving model..."
            self.best_val_auc = val_auc
            env.model.save_model('./best_gbtree.model')

In [86]:
param = {'max_depth': 10, 'eta': 0.3, 'silent': 1, 'objective': 'binary:logistic'}
param['nthread'] = 4
param['eval_metric'] = 'auc'
param['subsample'] = 0.8
param['colsample_bytree']= 0.8
#param['colsample_bylevel']= 0.9
#param['gamma'] = 0.01  # regularization of the term, gamma * T, T: number of leaves
#param['min_child_weight'] = 100
param['booster'] = "gbtree"

watchlist  = [(dval,'validation'), (dtrain,'train')]
num_round = 1000

In [87]:
print "Start training..."
t0 = time()
metricRecords = AucCall(0.)
bst = xgb.train(param, dtrain, num_round, watchlist, callbacks=[metricRecords])
print "\nTraining complete!"
print "Elapsed time: %.2f seconds." %(time()-t0)

Start training...
The BEST val_auc until now. Saving model...
[0]	validation-auc:0.921429	train-auc:0.921278
The BEST val_auc until now. Saving model...
[1]	validation-auc:0.928857	train-auc:0.92893
The BEST val_auc until now. Saving model...
[2]	validation-auc:0.931363	train-auc:0.931352
The BEST val_auc until now. Saving model...
[3]	validation-auc:0.933807	train-auc:0.933864
The BEST val_auc until now. Saving model...
[4]	validation-auc:0.934527	train-auc:0.934707
The BEST val_auc until now. Saving model...
[5]	validation-auc:0.937011	train-auc:0.937166
The BEST val_auc until now. Saving model...
[6]	validation-auc:0.938585	train-auc:0.938713
The BEST val_auc until now. Saving model...
[7]	validation-auc:0.9412	train-auc:0.941525
The BEST val_auc until now. Saving model...
[8]	validation-auc:0.942576	train-auc:0.942973
The BEST val_auc until now. Saving model...
[9]	validation-auc:0.94316	train-auc:0.943605
The BEST val_auc until now. Saving model...
[10]	validation-auc:0.944149	tra

In [88]:
from IPython.lib.display import FileLink

best = xgb.Booster(param)
best.load_model('./best_gbtree.model')
ypred = best.predict(dtest)
output = pd.DataFrame({ 'activity_id': act_id, 'outcome': ypred })
output.head()
output.to_csv('eval_redhat.csv', index=False)
FileLink('eval_redhat.csv')

### Scikit-learn

In [23]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=50, max_depth=10)

%timeit clf.fit(X_sparse, y)

1 loop, best of 3: 1min 30s per loop


In [27]:
from sklearn.metrics import roc_auc_score
pred_score = clf.predict_proba(X_sparse)
roc_auc_score(y, pred_score[:,1])

0.8815852605028025

In [31]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(n_estimators=50, max_depth=5)

t0 = time(); clf.fit(X_sparse, y); time()-t0

KeyboardInterrupt: 

In [27]:
from sklearn.metrics import roc_auc_score
pred_score = clf.predict_proba(X_sparse)
roc_auc_score(y, pred_score[:,1])

0.8815852605028025