In [None]:
import sys
import pandas as pd
import numpy as np
import scipy
from sklearn import preprocessing

# Put ./src on the module search path so the project-local `myclassify`
# module imported below can be found (presumably it lives there).
sys.path.append('./src')


import myclassify
# Re-execute the module so that edits to it are picked up without restarting
# the kernel. `reload` is used as a builtin, i.e. this notebook targets Python 2.
reload(myclassify)
from myclassify import MyFeatureSet
from myclassify import MyCountTable
from myclassify import merge_two_cat_columns
from myclassify import np_combine_rare

# Seed shared by the model cells ('seed' for XGBoost, 'random_state' for the
# logistic regression) to make results reproducible.
SEED = 1234

In [None]:
# Generate a collection of feature sets.
# Each feature set consists of:
#   Xtrain     - training part of the feature set
#   Xtest      - test part of the feature set
#   fname_list - feature names
#   find_list  - feature indices

# base feature set with' ROLE_ID' deleted and 'ROLE_ROLLUP_1', 'ROLE_ROLLUP_2' combined
BASE_FSET_FILE = './cache/base_fset.pickle'
class BaseFeatureSet(MyFeatureSet):
    def generate_feature_set(self, file_path = None):
        df_train = pd.read_csv('./train.csv')
        df_test = pd.read_csv('./test.csv')
        df_all = pd.concat([df_train.drop([u'ACTION'], axis = 1),df_test.drop([u'id'], axis = 1)], ignore_index=True)
        merge_two_cat_columns(df_all, 'ROLE_ROLLUP_1', 'ROLE_ROLLUP_2', 'ROLE_ROLLUP_12')
        col_keep = [u'RESOURCE', u'MGR_ID', u'ROLE_ROLLUP_12', u'ROLE_DEPTNAME', 
                    u'ROLE_TITLE', u'ROLE_FAMILY_DESC', u'ROLE_FAMILY']
        self.fname_list = col_keep
        self.find_list = range(len(col_keep)+1)
        df_all = df_all[col_keep]
        
        df_train = df_all[:][df_all.index<len(df_train.index)]
        df_test = df_all[:][df_all.index>=len(df_train.index)]
        
        self.Xtrain = df_train.values
        self.Xtest = df_test.values
        
        if file_path:
            self.save_feature_set(file_path)

        
base_fset = BaseFeatureSet()
%time base_fset.load_feature_set(BASE_FSET_FILE)
print base_fset.fname_list

In [None]:
# base feature set with one hot encoding
OHBASE_FSET_FILE = './cache/ohbase_fset.pickle'
class OHBaseFeatureSet(MyFeatureSet):
    def generate_feature_set(self, file_path = None):
        # load base_fset
        base_fset = BaseFeatureSet()
        self.Xtrain, self.Xtest = base_fset.fetch_feature_set(BASE_FSET_FILE)
        # label encoding
        lb_encoder = preprocessing.LabelEncoder()
        n_bf = len(base_fset.fname_list)
        for i in xrange(n_bf):
            lb_encoder.fit(np.hstack((self.Xtrain[:, i], self.Xtest[:, i])))
            self.Xtrain[:, i] = lb_encoder.transform(self.Xtrain[:,i])  
            self.Xtest[:, i] = lb_encoder.transform(self.Xtest[:, i])
        # one hot encoding
        oh_encoder = preprocessing.OneHotEncoder()
        oh_encoder.fit(np.vstack((self.Xtrain, self.Xtest)))
        self.Xtrain = oh_encoder.transform(self.Xtrain).tocsr()  
        self.Xtest = oh_encoder.transform(self.Xtest).tocsr()
        
        print type(self.Xtrain)
        
        self.fname_list = [u'OH_'+fname for fname in base_fset.fname_list]
        self.find_list = list(oh_encoder.feature_indices_)
        
        if file_path:
            self.save_feature_set(file_path)
        
ohbase_fset = OHBaseFeatureSet()
%time ohbase_fset.load_feature_set(OHBASE_FSET_FILE)
print ohbase_fset.fname_list
# print isinstance(ohbase_fset.Xtrain, scipy.sparse.csr_matrix)

In [None]:
# basic feature set with rare event combined and label encoded

CRBASE_FSET_FILE = './cache/crbase_fset.pickle'
class CRBaseFeatureSet(MyFeatureSet):
    def generate_feature_set(self, file_path = None):
        # generate from base_fset
        base_fset = BaseFeatureSet()
        self.Xtrain, self.Xtest = base_fset.fetch_feature_set(BASE_FSET_FILE)
        np_combine_rare(self.Xtrain, self.Xtest)
        self.fname_list = [u'CR_'+fname for fname in base_fset.fname_list]
        self.find_list = range(len(self.fname_list)+1)
        
        if file_path:
            self.save_feature_set(file_path)
        
crbase_fset = CRBaseFeatureSet()
%time crbase_fset.load_feature_set(CRBASE_FSET_FILE)
print crbase_fset.fname_list
# print crbase_fset.find_list

In [None]:
# Generate count tables.
# One-degree table: table['feature_name'][key] = number of appearances of key in 'feature_name'
# Two-degree table: table[('f_name1', 'f_name2')][key1][key2] = number of appearances of (key1, key2)
#                   table[('f_name1', 'f_name2')][key1]['total'] = total number of unique key2 values appearing with key1
# To get the counts for feature i of a feature set:
#   count_table.fset_one_degree_counts(base_fset, i, COUNT_TABLE_FILE)
# To get the percentage of (key i, key j) appearances among all (key i, 'feature j'):
#   count_table.fset_two_degree_counts(base_fset, i, j, 'per', COUNT_TABLE_FILE)
# To get the number of unique (key i, key j) among all (key i, 'feature j'):
#   count_table.fset_two_degree_counts(base_fset, i, j, 'num', COUNT_TABLE_FILE)

COUNT_TABLE_FILE = './cache/count_tb.pickle'
# Notebook-level object: the count-based feature-set classes defined in later
# cells read `count_table` as a global.
count_table = MyCountTable()
# First run: build the count tables from base_fset
# (COUNT_TABLE_FILE is presumably where they get cached -- TODO confirm).
%time count_table.fset_generate_count_tables(base_fset, [], COUNT_TABLE_FILE)
# Later runs: load the previously generated tables instead.
# %time count_table.load_count_tables(COUNT_TABLE_FILE)

In [None]:
# counting of base feature set
BASE_CNT_LS_FSET_FILE = './cache/basec_ls_fset.pickle'
class BaseCntLSFeatureSet(MyFeatureSet):
    def generate_feature_set(self, file_path = None):
        base_fset = BaseFeatureSet()
        base_fset.load_feature_set(BASE_FSET_FILE)
        n_bf = len(base_fset.fname_list)
        n_train = base_fset.Xtrain.shape[0]
        n_test = base_fset.Xtest.shape[0]
        self.Xtrain = np.zeros((n_train, n_bf), float)
        self.Xtest = np.zeros((n_test, n_bf), float)
        for i in xrange(n_bf):
            self.Xtrain[:, i], self.Xtest[:, i] = \
                count_table.fset_one_degree_counts(base_fset, i, COUNT_TABLE_FILE)
        myclassify.np_numeric_transform(self.Xtrain, self.Xtest, [], 'log', True)
        self.fname_list = [u'CNT_'+fname+u'_LS' for fname in base_fset.fname_list[1:]]
        self.find_list = range(len(self.fname_list)+1)
        
        if file_path:
            self.save_feature_set(file_path)
        
basec_ls_fset = BaseCntLSFeatureSet()
%time basec_ls_fset.load_feature_set(BASE_CNT_LS_FSET_FILE)
print basec_ls_fset.fname_list

In [None]:
# percentage of certain resources used by other parties

RSRC_PER_LS_FSET_FILE = './cache/rsrcp_ls_fset.pickle'
class RsrcPerLSFeatureSet(MyFeatureSet):
    def generate_feature_set(self, file_path = None):
        base_fset = BaseFeatureSet()
        base_fset.load_feature_set(BASE_FSET_FILE)
        n_bf = len(base_fset.fname_list)
        n_train = base_fset.Xtrain.shape[0]
        n_test = base_fset.Xtest.shape[0]
        self.Xtrain = np.zeros((n_train, n_bf-1), float)
        self.Xtest = np.zeros((n_test, n_bf-1), float)
        for i in xrange(1, n_bf):
            self.Xtrain[:, i-1], self.Xtest[:, i-1] = \
                count_table.fset_two_degree_counts(base_fset, i, 0, 'per', COUNT_TABLE_FILE)
        myclassify.np_numeric_transform(self.Xtrain, self.Xtest, [], 'log', True)
        self.fname_list = [u'RSRC_PER_'+fname+u'_LS' for fname in base_fset.fname_list[1:]]
        self.find_list = range(len(self.fname_list)+1)
        
        if file_path:
            self.save_feature_set(file_path)
        
rsrcp_ls_fset = RsrcPerLSFeatureSet()
%time rsrcp_ls_fset.load_feature_set(RSRC_PER_LS_FSET_FILE)
print rsrcp_ls_fset.fname_list

In [None]:
# different kinds of resources used by other parties
RSRC_NUM_LS_FSET_FILE = './cache/rsrcn_ls_fset.pickle'
class RsrcNumLSFeatureSet(MyFeatureSet):
    def generate_feature_set(self, file_path = None):
        base_fset = BaseFeatureSet()
        base_fset.load_feature_set(BASE_FSET_FILE)
        n_bf = len(base_fset.fname_list)
        n_train = base_fset.Xtrain.shape[0]
        n_test = base_fset.Xtest.shape[0]
        self.Xtrain = np.zeros((n_train, n_bf-1), float)
        self.Xtest = np.zeros((n_test, n_bf-1), float)
        for i in xrange(1, n_bf):
            self.Xtrain[:, i-1], self.Xtest[:, i-1] = \
                count_table.fset_two_degree_counts(base_fset, i, 0, 'num', COUNT_TABLE_FILE)
        myclassify.np_numeric_transform(self.Xtrain, self.Xtest, [], 'log', True)
        self.fname_list = [u'RSRC_NUM_'+fname+u'_LS' for fname in base_fset.fname_list[1:]]
        self.find_list = range(len(self.fname_list)+1)
        
        if file_path:
            self.save_feature_set(file_path)
        
rsrcn_ls_fset = RsrcNumLSFeatureSet()
%time rsrcn_ls_fset.load_feature_set(RSRC_NUM_LS_FSET_FILE)
print rsrcn_ls_fset.fname_list

In [None]:
# number of other parties used by certain manager
MGR_UNUM_LS_FSET_FILE = './cache/mgrun_ls_fset.pickle'
class MgrUNumLSFeatureSet(MyFeatureSet):
    def generate_feature_set(self, file_path = None):
        base_fset = BaseFeatureSet()
        base_fset.load_feature_set(BASE_FSET_FILE)
        n_bf = len(base_fset.fname_list)
        n_train = base_fset.Xtrain.shape[0]
        n_test = base_fset.Xtest.shape[0]
        self.Xtrain = np.zeros((n_train, n_bf-2), float)
        self.Xtest = np.zeros((n_test, n_bf-2), float)
        for i in xrange(2, n_bf):
            self.Xtrain[:, i-2], self.Xtest[:, i-2] = \
                count_table.fset_two_degree_counts(base_fset, 1, i, 'num', COUNT_TABLE_FILE)
            self.fname_list.append(u'MGR_UNUM_'+base_fset.fname_list[i]+u'_LS')
        myclassify.np_numeric_transform(self.Xtrain, self.Xtest, [], 'log', True)
        self.find_list = range(len(self.fname_list)+1)
        
        if file_path:
            self.save_feature_set(file_path)
        
mgrun_ls_fset = MgrUNumLSFeatureSet()
%time mgrun_ls_fset.generate_feature_set(MGR_UNUM_LS_FSET_FILE)
print mgrun_ls_fset.fname_list

In [None]:
# number of other parties used by certain department
DEPT_UNUM_LS_FSET_FILE = './cache/deptun_ls_fset.pickle'
class DeptUNumLSFeatureSet(MyFeatureSet):
    def generate_feature_set(self, file_path = None):
        base_fset = BaseFeatureSet()
        base_fset.load_feature_set(BASE_FSET_FILE)
        n_bf = len(base_fset.fname_list)
        n_train = base_fset.Xtrain.shape[0]
        n_test = base_fset.Xtest.shape[0]
        self.Xtrain = np.zeros((n_train, n_bf-2), float)
        self.Xtest = np.zeros((n_test, n_bf-2), float)
        col = 0
        for i in xrange(1, n_bf):
            if i == 3:
                continue
            self.Xtrain[:, col], self.Xtest[:, col] = \
                count_table.fset_two_degree_counts(base_fset, 3, i, 'num', COUNT_TABLE_FILE)
            col += 1
            self.fname_list.append(u'DEPT_UNUM_'+base_fset.fname_list[i]+u'_LS')
        myclassify.np_numeric_transform(self.Xtrain, self.Xtest, [], 'log', True)
        
        self.find_list = range(len(self.fname_list)+1)
        
        if file_path:
            self.save_feature_set(file_path)
        
deptun_ls_fset = DeptUNumLSFeatureSet()
%time deptun_ls_fset.load_feature_set(DEPT_UNUM_LS_FSET_FILE)
print deptun_ls_fset.fname_list

In [None]:
# base plus the two feature sets related with resources
BASIC_TREE_LS_FSET_FILE = './cache/btree_ls_fset.pickle'
class BasicTreeLSFeatureSet(MyFeatureSet):
    def generate_feature_set(self, file_path = None):
        base_fset = BaseFeatureSet()
        base_fset.load_feature_set(BASE_FSET_FILE)
        rsrcn_ls_fset = RsrcNumLSFeatureSet()
        rsrcn_ls_fset.load_feature_set(RSRC_NUM_LS_FSET_FILE)
        rsrcp_ls_fset = RsrcPerLSFeatureSet()
        rsrcp_ls_fset.load_feature_set(RSRC_PER_LS_FSET_FILE)
        
        self.fname_list, self.find_list, self.Xtrain, self.Xtest = \
            myclassify.concat_feature_set([base_fset, rsrcn_ls_fset, rsrcp_ls_fset])
        
        if file_path:
            self.save_feature_set(file_path)

btree_ls_fset = BasicTreeLSFeatureSet()
%time btree_ls_fset.load_feature_set(BASIC_TREE_LS_FSET_FILE)
print btree_ls_fset.fname_list

In [None]:
# base with rare event combined plus the two feature sets related with resources
CRBASIC_TREE_LS_FSET_FILE = './cache/crbtree_ls_fset.pickle'
class CRBasicTreeLSFeatureSet(MyFeatureSet):
    def generate_feature_set(self, file_path = None):
        crbase_fset = CRBaseFeatureSet()
        crbase_fset.load_feature_set(CRBASE_FSET_FILE)
        rsrcn_ls_fset = RsrcNumLSFeatureSet()
        rsrcn_ls_fset.load_feature_set(RSRC_NUM_LS_FSET_FILE)
        rsrcp_ls_fset = RsrcPerLSFeatureSet()
        rsrcp_ls_fset.load_feature_set(RSRC_PER_LS_FSET_FILE)
        
        self.fname_list, self.find_list, self.Xtrain, self.Xtest = \
            myclassify.concat_feature_set([crbase_fset, rsrcn_ls_fset, rsrcp_ls_fset])
        
        if file_path:
            self.save_feature_set(file_path)

crbtree_ls_fset = CRBasicTreeLSFeatureSet()
%time crbtree_ls_fset.load_feature_set(CRBASIC_TREE_LS_FSET_FILE)
print crbtree_ls_fset.fname_list

In [None]:
# basic tree plus manager
BASICM_TREE_LS_FSET_FILE = './cache/bmtree_ls_fset.pickle'
class BasicMTreeLSFeatureSet(MyFeatureSet):
    def generate_feature_set(self, file_path = None):
        base_fset = BaseFeatureSet()
        base_fset.load_feature_set(BASE_FSET_FILE)
        rsrcn_ls_fset = RsrcNumLSFeatureSet()
        rsrcn_ls_fset.load_feature_set(RSRC_NUM_LS_FSET_FILE)
        rsrcp_ls_fset = RsrcPerLSFeatureSet()
        rsrcp_ls_fset.load_feature_set(RSRC_PER_LS_FSET_FILE)
        mgrun_ls_fset = MgrUNumLSFeatureSet()
        mgrun_ls_fset.load_feature_set(MGR_UNUM_LS_FSET_FILE)
        
        self.fname_list, self.find_list, self.Xtrain, self.Xtest = \
            myclassify.concat_feature_set([base_fset, rsrcn_ls_fset, rsrcp_ls_fset, mgrun_ls_fset])
        
        if file_path:
            self.save_feature_set(file_path)

bmtree_ls_fset = BasicMTreeLSFeatureSet()
%time bmtree_ls_fset.load_feature_set(BASICM_TREE_LS_FSET_FILE)
print bmtree_ls_fset.fname_list

In [None]:
# basic tree plus manager and department
BASICMD_TREE_LS_FSET_FILE = './cache/bmdtree_ls_fset.pickle'
class BasicMDTreeLSFeatureSet(MyFeatureSet):
    def generate_feature_set(self, file_path = None):
        base_fset = BaseFeatureSet()
        base_fset.load_feature_set(BASE_FSET_FILE)
        rsrcn_ls_fset = RsrcNumLSFeatureSet()
        rsrcn_ls_fset.load_feature_set(RSRC_NUM_LS_FSET_FILE)
        rsrcp_ls_fset = RsrcPerLSFeatureSet()
        rsrcp_ls_fset.load_feature_set(RSRC_PER_LS_FSET_FILE)
        mgrun_ls_fset = MgrUNumLSFeatureSet()
        mgrun_ls_fset.load_feature_set(MGR_UNUM_LS_FSET_FILE)
        deptun_ls_fset = DeptUNumLSFeatureSet()
        deptun_ls_fset.load_feature_set(DEPT_UNUM_LS_FSET_FILE)
        
        self.fname_list, self.find_list, self.Xtrain, self.Xtest = \
            myclassify.concat_feature_set([base_fset, rsrcn_ls_fset, rsrcp_ls_fset, mgrun_ls_fset, deptun_ls_fset])
        
        if file_path:
            self.save_feature_set(file_path)

bmdtree_ls_fset = BasicMDTreeLSFeatureSet()
%time bmdtree_ls_fset.load_feature_set(BASICMD_TREE_LS_FSET_FILE)
print bmdtree_ls_fset.fname_list

In [None]:
# basic tree plus manager , deparment and counting of base feature
BASICCMD_TREE_LS_FSET_FILE = './cache/bcmdtree_ls_fset.pickle'
class BasicCMDTreeLSFeatureSet(MyFeatureSet):
    def generate_feature_set(self, file_path = None):
        base_fset = BaseFeatureSet()
        base_fset.load_feature_set(BASE_FSET_FILE)
        basec_ls_fset = BaseCntLSFeatureSet()
        basec_ls_fset.load_feature_set(BASE_CNT_LS_FSET_FILE)
        rsrcn_ls_fset = RsrcNumLSFeatureSet()
        rsrcn_ls_fset.load_feature_set(RSRC_NUM_LS_FSET_FILE)
        rsrcp_ls_fset = RsrcPerLSFeatureSet()
        rsrcp_ls_fset.load_feature_set(RSRC_PER_LS_FSET_FILE)
        mgrun_ls_fset = MgrUNumLSFeatureSet()
        mgrun_ls_fset.load_feature_set(MGR_UNUM_LS_FSET_FILE)
        deptun_ls_fset = DeptUNumLSFeatureSet()
        deptun_ls_fset.load_feature_set(DEPT_UNUM_LS_FSET_FILE)
        
        fset_list = [base_fset, basec_ls_fset, rsrcn_ls_fset, rsrcp_ls_fset, mgrun_ls_fset, deptun_ls_fset]
        
        self.fname_list, self.find_list, self.Xtrain, self.Xtest = \
            myclassify.concat_feature_set(fset_list)
        
        if file_path:
            self.save_feature_set(file_path)

bcmdtree_ls_fset = BasicCMDTreeLSFeatureSet()
%time bcmdtree_ls_fset.load_feature_set(BASICCMD_TREE_LS_FSET_FILE)
print bcmdtree_ls_fset.fname_list

In [None]:
# one hot version of the above, can be used fro logistic
OHBASICCMD_TREE_LS_FSET_FILE = './cache/ohbcmdlr_ls_fset.pickle'
class OHBasicCMDTreeLSFeatureSet(MyFeatureSet):
    def generate_feature_set(self, file_path = None):
        ohbase_fset = OHBaseFeatureSet()
        ohbase_fset.load_feature_set(OHBASE_FSET_FILE)
        basec_ls_fset = BaseCntLSFeatureSet()
        basec_ls_fset.load_feature_set(BASE_CNT_LS_FSET_FILE)
        rsrcn_ls_fset = RsrcNumLSFeatureSet()
        rsrcn_ls_fset.load_feature_set(RSRC_NUM_LS_FSET_FILE)
        rsrcp_ls_fset = RsrcPerLSFeatureSet()
        rsrcp_ls_fset.load_feature_set(RSRC_PER_LS_FSET_FILE)
        mgrun_ls_fset = MgrUNumLSFeatureSet()
        mgrun_ls_fset.load_feature_set(MGR_UNUM_LS_FSET_FILE)
        deptun_ls_fset = DeptUNumLSFeatureSet()
        deptun_ls_fset.load_feature_set(DEPT_UNUM_LS_FSET_FILE)
        
        fset_list = [ohbase_fset, basec_ls_fset, rsrcn_ls_fset, rsrcp_ls_fset, mgrun_ls_fset, deptun_ls_fset]
        
        self.fname_list, self.find_list, self.Xtrain, self.Xtest = \
            myclassify.concat_feature_set(fset_list)
        
        if file_path:
            self.save_feature_set(file_path)

ohbcmdtree_ls_fset = OHBasicCMDTreeLSFeatureSet()
%time ohbcmdtree_ls_fset.load_feature_set(OHBASICCMD_TREE_LS_FSET_FILE)
print ohbcmdtree_ls_fset.fname_list

In [None]:
# Target vector for training and the id column for the submission files.
# Both CSV files are read in the same order as before: train first, then test.
df_train, df_test = map(pd.read_csv, ('./train.csv', './test.csv'))

ytrain = df_train[u'ACTION'].values
idtest = df_test[u'id'].values


In [None]:
# XGBoost prediction on bcmdtree_ls_fset, written out as a submission file
myxgb_params = {
    'objective': 'binary:logistic',
    'subsample': .9,
    'nthread': 4,
    'seed': SEED,
    'num_round': 1000,
    'learning_rate': 0.03,
    'n_estimators': 1000,
    'colsample_bylevel': 0.7,
    'max_depth': 20,
    'gamma': 0.6,
    'colsample_bytree': 0.85,
    'min_child_weight': 0.,
    'lambda': 0.8,
    'alpha': 0,
}

myxgb = myclassify.MyXGBoost(myxgb_params)
myxgb.fit(bcmdtree_ls_fset.Xtrain, ytrain)
myxgb_ypred = myxgb.predict_proba(bcmdtree_ls_fset.Xtest)

# `columns` fixes the column order of the output file: id first, then ACTION.
submission = pd.DataFrame({"id":idtest, "ACTION":myxgb_ypred},
                          columns=['id', 'ACTION'])
submission.to_csv("xgb_submission.csv", index=False)

In [None]:
# Logistic-regression prediction on ohbcmdtree_ls_fset, written out as a submission file
mylr_params = {
    'C': 2.,
    'n_jobs': -1,
    'penalty': 'l2',
    'solver': 'liblinear',
    'max_iter': 1000,
    'tol': 1e-10,
    'random_state': SEED,
    'verbose': 0,
}

mylr = myclassify.MyLogisticReg(mylr_params)
mylr.fit(ohbcmdtree_ls_fset.Xtrain, ytrain)
mylr_ypred = mylr.predict_proba(ohbcmdtree_ls_fset.Xtest)

# `columns` fixes the column order of the output file: id first, then ACTION.
submission = pd.DataFrame({"id":idtest, "ACTION":mylr_ypred},
                          columns=['id', 'ACTION'])
submission.to_csv("lr_submission.csv", index=False)