In [33]:
import sys
import pandas as pd
import numpy as np
import scipy
from sklearn import preprocessing

sys.path.append('./src')


import myclassify
reload(myclassify)
from myclassify import MyFeatureSet
from myclassify import MyCountTable
from myclassify import merge_two_cat_columns
from myclassify import np_combine_rare

# to make reproducible results
SEED = 1234

# generate a bunch of feature sets
# a feature set consists of:
# Xtrain, train part of feature set
# Xtest, test part of feature set
# fname_list, feature names
# find_list, feature indices

In [39]:
reload(myclassify)

# original feature set with 'ROLE_ID' deleted; will be used to generate numerical features
ORIG_FSET_FILE = './cache/orig_fset.pickle'
class OrigFeatureSet(MyFeatureSet):
    """Base feature set holding eight raw columns of the train and test CSV files."""
    def generate_feature_set(self, file_path = None):
        """Read ./train.csv and ./test.csv and fill Xtrain, Xtest, fname_list, find_list.

        file_path -- when truthy, the result is handed to save_feature_set(file_path).
        """
        raw_train = pd.read_csv('./train.csv')
        raw_test = pd.read_csv('./test.csv')
        n_train_rows = len(raw_train.index)

        # Stack train on top of test after removing the column each file has
        # that the other lacks; ignore_index renumbers the rows 0..N-1.
        train_part = raw_train.drop([u'ACTION'], axis = 1)
        test_part = raw_test.drop([u'id'], axis = 1)
        stacked = pd.concat([train_part, test_part], ignore_index=True)

        kept_cols = [u'RESOURCE', u'MGR_ID', u'ROLE_DEPTNAME',
                     u'ROLE_TITLE', u'ROLE_FAMILY_DESC', u'ROLE_FAMILY', 'ROLE_ROLLUP_1', 'ROLE_ROLLUP_2']
        stacked = stacked[kept_cols]

        self.fname_list = kept_cols
        # one entry more than the number of features
        self.find_list = range(len(kept_cols)+1)

        # rows with index below n_train_rows came from train.csv, the rest from test.csv
        from_train = stacked.index < n_train_rows
        self.Xtrain = stacked[from_train].values
        self.Xtest = stacked[~from_train].values

        if file_path:
            self.save_feature_set(file_path)

        
# Build the base feature set, cache it at ORIG_FSET_FILE and show its feature names.
# The global orig_fset is reused by later cells.
orig_fset = OrigFeatureSet()
%time orig_fset.generate_feature_set(ORIG_FSET_FILE)
print orig_fset.fname_list

CPU times: user 159 ms, sys: 4 ms, total: 163 ms
Wall time: 163 ms
[u'RESOURCE', u'MGR_ID', u'ROLE_DEPTNAME', u'ROLE_TITLE', u'ROLE_FAMILY_DESC', u'ROLE_FAMILY', 'ROLE_ROLLUP_1', 'ROLE_ROLLUP_2']


In [41]:
reload(myclassify)

# original feature set with 'ROLE_ID', 'ROLE_ROLLUP_1', 'ROLE_ROLLUP_2' deleted
ORIGR_FSET_FILE = './cache/origr_fset.pickle'
class OrigRFeatureSet(MyFeatureSet):
    """Reduced base feature set: six raw columns of the train and test CSV files."""
    def generate_feature_set(self, file_path = None):
        """Read ./train.csv and ./test.csv and fill Xtrain, Xtest, fname_list, find_list.

        file_path -- when truthy, the result is handed to save_feature_set(file_path).
        """
        raw_train = pd.read_csv('./train.csv')
        raw_test = pd.read_csv('./test.csv')
        n_train_rows = len(raw_train.index)

        # Stack train on top of test after removing the column each file has
        # that the other lacks; ignore_index renumbers the rows 0..N-1.
        train_part = raw_train.drop([u'ACTION'], axis = 1)
        test_part = raw_test.drop([u'id'], axis = 1)
        stacked = pd.concat([train_part, test_part], ignore_index=True)

        kept_cols = [u'RESOURCE', u'MGR_ID', u'ROLE_DEPTNAME',
                     u'ROLE_TITLE', u'ROLE_FAMILY_DESC', u'ROLE_FAMILY']
        stacked = stacked[kept_cols]

        self.fname_list = kept_cols
        # one entry more than the number of features
        self.find_list = range(len(kept_cols)+1)

        # rows with index below n_train_rows came from train.csv, the rest from test.csv
        from_train = stacked.index < n_train_rows
        self.Xtrain = stacked[from_train].values
        self.Xtest = stacked[~from_train].values

        if file_path:
            self.save_feature_set(file_path)

        
# Build the reduced base feature set, cache it at ORIGR_FSET_FILE and show its feature names.
origr_fset = OrigRFeatureSet()
%time origr_fset.generate_feature_set(ORIGR_FSET_FILE)
print origr_fset.fname_list

CPU times: user 122 ms, sys: 7.96 ms, total: 130 ms
Wall time: 129 ms
[u'RESOURCE', u'MGR_ID', u'ROLE_DEPTNAME', u'ROLE_TITLE', u'ROLE_FAMILY_DESC', u'ROLE_FAMILY']


In [42]:
reload(myclassify)

# reduced original feature set:
# original feature set with 'ROLE_ID', 'ROLE_ROLLUP_1', 'ROLE_ROLLUP_2' deleted and rare events combined
CR_ORIGR_FSET_FILE = './cache/crorigr_fset.pickle'
class CROrigRFeatureSet(MyFeatureSet):
    """Cached OrigRFeatureSet data passed through np_combine_rare, names prefixed with 'CR_'."""
    def generate_feature_set(self, file_path = None):
        """Load the set cached at ORIGR_FSET_FILE, apply np_combine_rare, optionally save.

        file_path -- when truthy, the result is handed to save_feature_set(file_path).
        """
        source = OrigRFeatureSet()
        source.load_feature_set(ORIGR_FSET_FILE)

        self.Xtrain, self.Xtest = source.Xtrain, source.Xtest
        # The return value is ignored, so np_combine_rare is relied on to
        # rewrite both arrays in place (presumably merging rare values).
        np_combine_rare(self.Xtrain, self.Xtest)

        renamed = []
        for fname in source.fname_list:
            renamed.append(u'CR_'+fname)
        self.fname_list = renamed
        self.find_list = range(len(renamed)+1)

        if file_path:
            self.save_feature_set(file_path)
        
# Build the rare-combined reduced set from the ORIGR_FSET_FILE cache (it must exist),
# cache the result at CR_ORIGR_FSET_FILE and show its feature names.
crorigr_fset = CROrigRFeatureSet()
%time crorigr_fset.generate_feature_set(CR_ORIGR_FSET_FILE)
print crorigr_fset.fname_list

CPU times: user 420 ms, sys: 4 ms, total: 424 ms
Wall time: 424 ms
[u'CR_RESOURCE', u'CR_MGR_ID', u'CR_ROLE_DEPTNAME', u'CR_ROLE_TITLE', u'CR_ROLE_FAMILY_DESC', u'CR_ROLE_FAMILY']


In [43]:
reload(myclassify)

# reduced original feature set with one hot encoding
OHCR_ORIGR_FSET_FILE = './cache/ohcrorigr_fset.pickle'
class OHCROrigRFeatureSet(MyFeatureSet):
    def generate_feature_set(self, file_path = None):
        crorigr_fset = CROrigRFeatureSet()
        crorigr_fset.load_feature_set(CR_ORIGR_FSET_FILE)
        self.Xtrain = crorigr_fset.Xtrain
        self.Xtest = crorigr_fset.Xtest
        # label encoding
        lb_encoder = preprocessing.LabelEncoder()
        n_orf = len(origr_fset.fname_list)
        for i in xrange(n_orf):
            lb_encoder.fit(np.hstack((self.Xtrain[:, i], self.Xtest[:, i])))
            self.Xtrain[:, i] = lb_encoder.transform(self.Xtrain[:,i])  
            self.Xtest[:, i] = lb_encoder.transform(self.Xtest[:, i])
        # one hot encoding
        oh_encoder = preprocessing.OneHotEncoder()
        oh_encoder.fit(np.vstack((self.Xtrain, self.Xtest)))
        self.Xtrain = oh_encoder.transform(self.Xtrain).tocsr()  
        self.Xtest = oh_encoder.transform(self.Xtest).tocsr()
        
        print type(self.Xtrain)
        
        self.fname_list = [u'OH_'+fname for fname in crorigr_fset.fname_list]
        self.find_list = list(oh_encoder.feature_indices_)
        
        if file_path:
            self.save_feature_set(file_path)
        
# Build the one-hot set from the CR_ORIGR_FSET_FILE cache, cache the result at
# OHCR_ORIGR_FSET_FILE and show its feature names.
ohcrorigr_fset = OHCROrigRFeatureSet()
%time ohcrorigr_fset.generate_feature_set(OHCR_ORIGR_FSET_FILE)
print ohcrorigr_fset.fname_list
# print ohcrorigr_fset.find_list

<class 'scipy.sparse.csr.csr_matrix'>
CPU times: user 154 ms, sys: 12 ms, total: 166 ms
Wall time: 166 ms
[u'OH_CR_RESOURCE', u'OH_CR_MGR_ID', u'OH_CR_ROLE_DEPTNAME', u'OH_CR_ROLE_TITLE', u'OH_CR_ROLE_FAMILY_DESC', u'OH_CR_ROLE_FAMILY']


In [45]:
reload(myclassify)

# tuple feature set generated from orig_fset
ORIG_TUPLE_FSET_FILE = './cache/orig_tuple_fset.pickle'
class OrigTupleFeatureSet(MyFeatureSet):
    """Pairwise ("tuple") categorical features built from OrigFeatureSet columns."""
    def generate_feature_set(self, file_path = None):
        """Merge every pair of base columns and keep the informative pairs.

        A pair (i, j) is kept only when the merged column has strictly more
        distinct values (over train+test) than column i and than column j.
        The result replaces Xtrain, Xtest, fname_list and find_list, so
        calling this again on the same instance does not duplicate columns.

        file_path -- when truthy, the result is handed to save_feature_set(file_path).
        """
        # NOTE: the base set is regenerated from the CSV files (and re-saved to
        # ORIG_FSET_FILE) here rather than loaded from the cache.
        orig_fset = OrigFeatureSet()
        orig_fset.generate_feature_set(ORIG_FSET_FILE)

        n_of = len(orig_fset.fname_list)
        n_train = orig_fset.Xtrain.shape[0]
        n_test = orig_fset.Xtest.shape[0]

        # distinct-value count of each base column, computed once instead of
        # once per pair
        n_unique = []
        for k in xrange(n_of):
            k_ind = orig_fset.find_list[k]
            k_data = np.hstack((orig_fset.Xtrain[:, k_ind], orig_fset.Xtest[:, k_ind]))
            n_unique.append(np.unique(k_data).shape[0])

        train_cols = []
        test_cols = []
        names = []
        for i in xrange(n_of):
            for j in xrange(i+1, n_of):
                ij_train, ij_test = myclassify.fset_merge_two_cat_columns(orig_fset, (i,j))
                unique_ij = np.unique(np.hstack((ij_train, ij_test))).shape[0]

                if unique_ij > n_unique[i] and unique_ij > n_unique[j]:
                    train_cols.append(ij_train.reshape(n_train, 1))
                    test_cols.append(ij_test.reshape(n_test, 1))
                    names.append(orig_fset.fname_list[i] + u'_AND_' + orig_fset.fname_list[j])

        if names:
            # a single stack instead of growing the matrix one column at a time
            self.Xtrain = np.hstack(train_cols)
            self.Xtest = np.hstack(test_cols)
        else:
            # no pair qualified: expose empty matrices with the right row counts
            self.Xtrain = np.empty((n_train, 0), dtype=orig_fset.Xtrain.dtype)
            self.Xtest = np.empty((n_test, 0), dtype=orig_fset.Xtest.dtype)

        self.fname_list = names
        self.find_list = range(len(self.fname_list)+1)

        if file_path:
            self.save_feature_set(file_path)

        
# Build the pairwise feature set, cache it at ORIG_TUPLE_FSET_FILE and show the
# kept pair names and the train matrix shape.
orig_tuple_fset = OrigTupleFeatureSet()
%time orig_tuple_fset.generate_feature_set(ORIG_TUPLE_FSET_FILE)
print orig_tuple_fset.fname_list
print orig_tuple_fset.Xtrain.shape

CPU times: user 2.73 s, sys: 32 ms, total: 2.77 s
Wall time: 3 s
[u'RESOURCE_AND_MGR_ID', u'RESOURCE_AND_ROLE_DEPTNAME', u'RESOURCE_AND_ROLE_TITLE', u'RESOURCE_AND_ROLE_FAMILY_DESC', u'RESOURCE_AND_ROLE_FAMILY', u'RESOURCE_AND_ROLE_ROLLUP_1', u'RESOURCE_AND_ROLE_ROLLUP_2', u'MGR_ID_AND_ROLE_DEPTNAME', u'MGR_ID_AND_ROLE_TITLE', u'MGR_ID_AND_ROLE_FAMILY_DESC', u'MGR_ID_AND_ROLE_FAMILY', u'MGR_ID_AND_ROLE_ROLLUP_1', u'MGR_ID_AND_ROLE_ROLLUP_2', u'ROLE_DEPTNAME_AND_ROLE_TITLE', u'ROLE_DEPTNAME_AND_ROLE_FAMILY_DESC', u'ROLE_DEPTNAME_AND_ROLE_FAMILY', u'ROLE_DEPTNAME_AND_ROLE_ROLLUP_1', u'ROLE_DEPTNAME_AND_ROLE_ROLLUP_2', u'ROLE_TITLE_AND_ROLE_FAMILY_DESC', u'ROLE_TITLE_AND_ROLE_ROLLUP_1', u'ROLE_TITLE_AND_ROLE_ROLLUP_2', u'ROLE_FAMILY_DESC_AND_ROLE_FAMILY', u'ROLE_FAMILY_DESC_AND_ROLE_ROLLUP_1', u'ROLE_FAMILY_DESC_AND_ROLE_ROLLUP_2', u'ROLE_FAMILY_AND_ROLE_ROLLUP_1', u'ROLE_FAMILY_AND_ROLE_ROLLUP_2', u'ROLE_ROLLUP_1_AND_ROLE_ROLLUP_2']
(32769, 27)


In [46]:
reload(myclassify)

# tuple feature set generated from orig_fset with rare events combined
CR_ORIG_TUPLE_FSET_FILE = './cache/crorig_tuple_fset.pickle'
class CROrigTupleFeatureSet(MyFeatureSet):
    """Cached OrigTupleFeatureSet data passed through np_combine_rare, names prefixed with 'CR_'."""
    def generate_feature_set(self, file_path = None):
        """Load the set cached at ORIG_TUPLE_FSET_FILE, apply np_combine_rare, optionally save.

        file_path -- when truthy, the result is handed to save_feature_set(file_path).
        """
        source = OrigTupleFeatureSet()
        source.load_feature_set(ORIG_TUPLE_FSET_FILE)

        self.Xtrain, self.Xtest = source.Xtrain, source.Xtest
        # The return value is ignored, so np_combine_rare is relied on to
        # rewrite both arrays in place (presumably merging rare values).
        np_combine_rare(self.Xtrain, self.Xtest)

        renamed = []
        for fname in source.fname_list:
            renamed.append(u'CR_'+fname)
        self.fname_list = renamed
        self.find_list = range(len(renamed)+1)

        if file_path:
            self.save_feature_set(file_path)
        
# Build the rare-combined pairwise set from the ORIG_TUPLE_FSET_FILE cache,
# cache the result at CR_ORIG_TUPLE_FSET_FILE and show its feature names.
crorig_tuple_fset = CROrigTupleFeatureSet()
%time crorig_tuple_fset.generate_feature_set(CR_ORIG_TUPLE_FSET_FILE)
print crorig_tuple_fset.fname_list

CPU times: user 15.8 s, sys: 19.9 ms, total: 15.8 s
Wall time: 15.8 s
[u'CR_RESOURCE_AND_MGR_ID', u'CR_RESOURCE_AND_ROLE_DEPTNAME', u'CR_RESOURCE_AND_ROLE_TITLE', u'CR_RESOURCE_AND_ROLE_FAMILY_DESC', u'CR_RESOURCE_AND_ROLE_FAMILY', u'CR_RESOURCE_AND_ROLE_ROLLUP_1', u'CR_RESOURCE_AND_ROLE_ROLLUP_2', u'CR_MGR_ID_AND_ROLE_DEPTNAME', u'CR_MGR_ID_AND_ROLE_TITLE', u'CR_MGR_ID_AND_ROLE_FAMILY_DESC', u'CR_MGR_ID_AND_ROLE_FAMILY', u'CR_MGR_ID_AND_ROLE_ROLLUP_1', u'CR_MGR_ID_AND_ROLE_ROLLUP_2', u'CR_ROLE_DEPTNAME_AND_ROLE_TITLE', u'CR_ROLE_DEPTNAME_AND_ROLE_FAMILY_DESC', u'CR_ROLE_DEPTNAME_AND_ROLE_FAMILY', u'CR_ROLE_DEPTNAME_AND_ROLE_ROLLUP_1', u'CR_ROLE_DEPTNAME_AND_ROLE_ROLLUP_2', u'CR_ROLE_TITLE_AND_ROLE_FAMILY_DESC', u'CR_ROLE_TITLE_AND_ROLE_ROLLUP_1', u'CR_ROLE_TITLE_AND_ROLE_ROLLUP_2', u'CR_ROLE_FAMILY_DESC_AND_ROLE_FAMILY', u'CR_ROLE_FAMILY_DESC_AND_ROLE_ROLLUP_1', u'CR_ROLE_FAMILY_DESC_AND_ROLE_ROLLUP_2', u'CR_ROLE_FAMILY_AND_ROLE_ROLLUP_1', u'CR_ROLE_FAMILY_AND_ROLE_ROLLUP_2', u'CR_ROL

In [47]:
reload(myclassify)

# tuple feature set (rare events combined) with one hot encoding
OHCR_ORIG_TUPLE_FSET_FILE = './cache/ohcrorig_tuple_fset.pickle'
class OHCROrigTupleFeatureSet(MyFeatureSet):
    def generate_feature_set(self, file_path = None):
        crorig_tuple_fset = CROrigTupleFeatureSet()
        crorig_tuple_fset.load_feature_set(CR_ORIG_TUPLE_FSET_FILE)
        self.Xtrain = crorig_tuple_fset.Xtrain
        self.Xtest = crorig_tuple_fset.Xtest
        # label encoding
        lb_encoder = preprocessing.LabelEncoder()
        n_orf = len(origr_fset.fname_list)
        for i in xrange(n_orf):
            lb_encoder.fit(np.hstack((self.Xtrain[:, i], self.Xtest[:, i])))
            self.Xtrain[:, i] = lb_encoder.transform(self.Xtrain[:,i])  
            self.Xtest[:, i] = lb_encoder.transform(self.Xtest[:, i])
        # one hot encoding
        oh_encoder = preprocessing.OneHotEncoder()
        oh_encoder.fit(np.vstack((self.Xtrain, self.Xtest)))
        self.Xtrain = oh_encoder.transform(self.Xtrain).tocsr()  
        self.Xtest = oh_encoder.transform(self.Xtest).tocsr()
        
        print type(self.Xtrain)
        
        self.fname_list = [u'OH_'+fname for fname in crorig_tuple_fset.fname_list]
        self.find_list = list(oh_encoder.feature_indices_)
        
        if file_path:
            self.save_feature_set(file_path)
        
# Build the one-hot pairwise set from the CR_ORIG_TUPLE_FSET_FILE cache, cache
# the result at OHCR_ORIG_TUPLE_FSET_FILE and show its feature names.
ohcrorig_tuple_fset = OHCROrigTupleFeatureSet()
%time ohcrorig_tuple_fset.generate_feature_set(OHCR_ORIG_TUPLE_FSET_FILE)
print ohcrorig_tuple_fset.fname_list
# print ohcrorig_tuple_fset.find_list

<class 'scipy.sparse.csr.csr_matrix'>
CPU times: user 585 ms, sys: 100 ms, total: 685 ms
Wall time: 696 ms
[u'OH_CR_RESOURCE_AND_MGR_ID', u'OH_CR_RESOURCE_AND_ROLE_DEPTNAME', u'OH_CR_RESOURCE_AND_ROLE_TITLE', u'OH_CR_RESOURCE_AND_ROLE_FAMILY_DESC', u'OH_CR_RESOURCE_AND_ROLE_FAMILY', u'OH_CR_RESOURCE_AND_ROLE_ROLLUP_1', u'OH_CR_RESOURCE_AND_ROLE_ROLLUP_2', u'OH_CR_MGR_ID_AND_ROLE_DEPTNAME', u'OH_CR_MGR_ID_AND_ROLE_TITLE', u'OH_CR_MGR_ID_AND_ROLE_FAMILY_DESC', u'OH_CR_MGR_ID_AND_ROLE_FAMILY', u'OH_CR_MGR_ID_AND_ROLE_ROLLUP_1', u'OH_CR_MGR_ID_AND_ROLE_ROLLUP_2', u'OH_CR_ROLE_DEPTNAME_AND_ROLE_TITLE', u'OH_CR_ROLE_DEPTNAME_AND_ROLE_FAMILY_DESC', u'OH_CR_ROLE_DEPTNAME_AND_ROLE_FAMILY', u'OH_CR_ROLE_DEPTNAME_AND_ROLE_ROLLUP_1', u'OH_CR_ROLE_DEPTNAME_AND_ROLE_ROLLUP_2', u'OH_CR_ROLE_TITLE_AND_ROLE_FAMILY_DESC', u'OH_CR_ROLE_TITLE_AND_ROLE_ROLLUP_1', u'OH_CR_ROLE_TITLE_AND_ROLE_ROLLUP_2', u'OH_CR_ROLE_FAMILY_DESC_AND_ROLE_FAMILY', u'OH_CR_ROLE_FAMILY_DESC_AND_ROLE_ROLLUP_1', u'OH_CR_ROLE_FAMIL

In [48]:
# generate count tables ('both') for orig_fset and one-degree ('one') count tables for orig_tuple_fset
# one-count table['feature_name'][key] = number of appearances of key in 'feature_name'
# two-count table[('f_name1', 'f_name2')][key1][key2] = number of appearances of (key1, key2)
# two-count table[('f_name1', 'f_name2')][key1]['total'] = total number of unique key2 values appearing with key1
# to generate the counts for feature i of a feature set:
# use count_table.fset_one_degree_counts(base_fset, i, COUNT_TABLE_FILE)
# to generate the percentage of (key i, key j) appearances among all (key i, 'feature j'):
# use count_table.fset_two_degree_counts(base_fset, i, j, 'per', COUNT_TABLE_FILE)
# to generate the number of unique (key i, key j) among all (key i, 'feature j'):
# use count_table.fset_two_degree_counts(base_fset, i, j, 'num', COUNT_TABLE_FILE)

# Pick up any edits made to ./src/myclassify.py since the last cell.
reload(myclassify)

# Cache locations. The two *_FSET_FILE constants repeat the values assigned in
# earlier cells.
COUNT_TABLE_FILE = './cache/count_tb.pickle'
ORIG_FSET_FILE = './cache/orig_fset.pickle'
ORIG_TUPLE_FSET_FILE = './cache/orig_tuple_fset.pickle'

# Global count table used by the count-based feature-set classes defined in
# later cells. In the recorded run the load found no file (see cell output).
count_table = MyCountTable()
count_table.load_count_tables(COUNT_TABLE_FILE)

# Rebind the notebook globals to the cached base and pairwise feature sets.
orig_fset = OrigFeatureSet()
orig_fset.load_feature_set(ORIG_FSET_FILE)

orig_tuple_fset = OrigTupleFeatureSet()
orig_tuple_fset.load_feature_set(ORIG_TUPLE_FSET_FILE)

# 'both' tables for the base columns ...
%time count_table.fset_generate_count_tables(orig_fset, 'both', [], COUNT_TABLE_FILE)

# ... and 'one' tables only for the pairwise columns.
%time count_table.fset_generate_count_tables(orig_tuple_fset, 'one', [], COUNT_TABLE_FILE)

Loading count table file failed: file not found.
CPU times: user 30 s, sys: 96 ms, total: 30.1 s
Wall time: 30.1 s
CPU times: user 36.6 s, sys: 92 ms, total: 36.7 s
Wall time: 36.7 s


In [49]:
reload(myclassify)

# count of appearance of orig_fset
ORIG_CNT_LS_FSET_FILE = './cache/origc_ls_fset.pickle'
class OrigCntLSFeatureSet(MyFeatureSet):
    """One-degree count feature for every column of the cached OrigFeatureSet."""
    def generate_feature_set(self, file_path = None):
        """Fill one float column per base feature from the global count_table.

        file_path -- when truthy, the result is handed to save_feature_set(file_path).
        """
        base = OrigFeatureSet()
        base.load_feature_set(ORIG_FSET_FILE)

        n_feat = len(base.fname_list)
        self.Xtrain = np.zeros((base.Xtrain.shape[0], n_feat), float)
        self.Xtest = np.zeros((base.Xtest.shape[0], n_feat), float)

        for col in xrange(n_feat):
            train_cnt, test_cnt = count_table.fset_one_degree_counts(base, col, COUNT_TABLE_FILE)
            self.Xtrain[:, col] = train_cnt
            self.Xtest[:, col] = test_cnt

        # return value unused: the helper is relied on to transform both
        # matrices in place ('log' mode; presumably what the _LS suffix denotes)
        myclassify.np_numeric_transform(self.Xtrain, self.Xtest, [], 'log', True)

        self.fname_list = [u'CNT_'+name+u'_LS' for name in base.fname_list]
        self.find_list = range(len(self.fname_list)+1)

        if file_path:
            self.save_feature_set(file_path)
        
# Build the base-column count features (needs the global count_table), cache
# them at ORIG_CNT_LS_FSET_FILE and show the feature names.
origc_ls_fset = OrigCntLSFeatureSet()
%time origc_ls_fset.generate_feature_set(ORIG_CNT_LS_FSET_FILE)
print origc_ls_fset.fname_list

CPU times: user 1.6 s, sys: 12 ms, total: 1.61 s
Wall time: 1.61 s
[u'CNT_RESOURCE_LS', u'CNT_MGR_ID_LS', u'CNT_ROLE_DEPTNAME_LS', u'CNT_ROLE_TITLE_LS', u'CNT_ROLE_FAMILY_DESC_LS', u'CNT_ROLE_FAMILY_LS', u'CNT_ROLE_ROLLUP_1_LS', u'CNT_ROLE_ROLLUP_2_LS']


In [50]:
reload(myclassify)

# count of appearance of orig_tuple_fset
ORIG_TUPLE_CNT_LS_FSET_FILE = './cache/otc_ls_fset.pickle'
class OrigTupleCntLSFeatureSet(MyFeatureSet):
    """One-degree count feature for every column of the cached OrigTupleFeatureSet."""
    def generate_feature_set(self, file_path = None):
        """Fill one float column per pairwise feature from the global count_table.

        file_path -- when truthy, the result is handed to save_feature_set(file_path).
        """
        base = OrigTupleFeatureSet()
        base.load_feature_set(ORIG_TUPLE_FSET_FILE)

        n_feat = len(base.fname_list)
        self.Xtrain = np.zeros((base.Xtrain.shape[0], n_feat), float)
        self.Xtest = np.zeros((base.Xtest.shape[0], n_feat), float)

        for col in xrange(n_feat):
            train_cnt, test_cnt = count_table.fset_one_degree_counts(base, col, COUNT_TABLE_FILE)
            self.Xtrain[:, col] = train_cnt
            self.Xtest[:, col] = test_cnt

        # return value unused: the helper is relied on to transform both
        # matrices in place ('log' mode; presumably what the _LS suffix denotes)
        myclassify.np_numeric_transform(self.Xtrain, self.Xtest, [], 'log', True)

        self.fname_list = [u'CNT_'+name+u'_LS' for name in base.fname_list]
        self.find_list = range(len(self.fname_list)+1)

        if file_path:
            self.save_feature_set(file_path)
        
# Build the pairwise-column count features (needs the global count_table),
# cache them at ORIG_TUPLE_CNT_LS_FSET_FILE and show the feature names.
otc_ls_fset = OrigTupleCntLSFeatureSet()
%time otc_ls_fset.generate_feature_set(ORIG_TUPLE_CNT_LS_FSET_FILE)
print otc_ls_fset.fname_list

CPU times: user 5.29 s, sys: 24 ms, total: 5.31 s
Wall time: 5.3 s
[u'CNT_RESOURCE_AND_MGR_ID_LS', u'CNT_RESOURCE_AND_ROLE_DEPTNAME_LS', u'CNT_RESOURCE_AND_ROLE_TITLE_LS', u'CNT_RESOURCE_AND_ROLE_FAMILY_DESC_LS', u'CNT_RESOURCE_AND_ROLE_FAMILY_LS', u'CNT_RESOURCE_AND_ROLE_ROLLUP_1_LS', u'CNT_RESOURCE_AND_ROLE_ROLLUP_2_LS', u'CNT_MGR_ID_AND_ROLE_DEPTNAME_LS', u'CNT_MGR_ID_AND_ROLE_TITLE_LS', u'CNT_MGR_ID_AND_ROLE_FAMILY_DESC_LS', u'CNT_MGR_ID_AND_ROLE_FAMILY_LS', u'CNT_MGR_ID_AND_ROLE_ROLLUP_1_LS', u'CNT_MGR_ID_AND_ROLE_ROLLUP_2_LS', u'CNT_ROLE_DEPTNAME_AND_ROLE_TITLE_LS', u'CNT_ROLE_DEPTNAME_AND_ROLE_FAMILY_DESC_LS', u'CNT_ROLE_DEPTNAME_AND_ROLE_FAMILY_LS', u'CNT_ROLE_DEPTNAME_AND_ROLE_ROLLUP_1_LS', u'CNT_ROLE_DEPTNAME_AND_ROLE_ROLLUP_2_LS', u'CNT_ROLE_TITLE_AND_ROLE_FAMILY_DESC_LS', u'CNT_ROLE_TITLE_AND_ROLE_ROLLUP_1_LS', u'CNT_ROLE_TITLE_AND_ROLE_ROLLUP_2_LS', u'CNT_ROLE_FAMILY_DESC_AND_ROLE_FAMILY_LS', u'CNT_ROLE_FAMILY_DESC_AND_ROLE_ROLLUP_1_LS', u'CNT_ROLE_FAMILY_DESC_AND_ROLE_ROL

In [51]:
reload(myclassify)

# orig_fset version of resource counts
ORSRC_PER_LS_FSET_FILE = './cache/orsrcp_ls_fset.pickle'
class ORsrcPerLSFeatureSet(MyFeatureSet):
    """'per' two-degree counts of every non-RESOURCE base column against column 0."""
    def generate_feature_set(self, file_path = None):
        """Fill one float column per base feature after the first one.

        file_path -- when truthy, the result is handed to save_feature_set(file_path).
        """
        base = OrigFeatureSet()
        base.load_feature_set(ORIG_FSET_FILE)

        # base column 0 (RESOURCE) is the fixed partner, so it gets no output column
        other_names = base.fname_list[1:]
        n_out = len(base.fname_list) - 1
        self.Xtrain = np.zeros((base.Xtrain.shape[0], n_out), float)
        self.Xtest = np.zeros((base.Xtest.shape[0], n_out), float)

        for out_col in xrange(n_out):
            src_col = out_col + 1
            train_val, test_val = count_table.fset_two_degree_counts(
                base, src_col, 0, 'per', COUNT_TABLE_FILE)
            self.Xtrain[:, out_col] = train_val
            self.Xtest[:, out_col] = test_val

        # return value unused: the helper is relied on to transform both
        # matrices in place ('log' mode)
        myclassify.np_numeric_transform(self.Xtrain, self.Xtest, [], 'log', True)

        self.fname_list = [u'RSRC_PER_'+name+u'_LS' for name in other_names]
        self.find_list = range(len(self.fname_list)+1)

        if file_path:
            self.save_feature_set(file_path)
        
# Build the resource 'per' features (needs the global count_table), cache them
# at ORSRC_PER_LS_FSET_FILE and show the feature names.
orsrcp_ls_fset = ORsrcPerLSFeatureSet()
%time orsrcp_ls_fset.generate_feature_set(ORSRC_PER_LS_FSET_FILE)
print orsrcp_ls_fset.fname_list

ORSRC_NUM_LS_FSET_FILE = './cache/orsrcn_ls_fset.pickle'
class ORsrcNumLSFeatureSet(MyFeatureSet):
    """'num' two-degree counts of every non-RESOURCE base column against column 0."""
    def generate_feature_set(self, file_path = None):
        """Fill one float column per base feature after the first one.

        file_path -- when truthy, the result is handed to save_feature_set(file_path).
        """
        base = OrigFeatureSet()
        base.load_feature_set(ORIG_FSET_FILE)

        # base column 0 (RESOURCE) is the fixed partner, so it gets no output column
        other_names = base.fname_list[1:]
        n_out = len(base.fname_list) - 1
        self.Xtrain = np.zeros((base.Xtrain.shape[0], n_out), float)
        self.Xtest = np.zeros((base.Xtest.shape[0], n_out), float)

        for out_col in xrange(n_out):
            src_col = out_col + 1
            train_val, test_val = count_table.fset_two_degree_counts(
                base, src_col, 0, 'num', COUNT_TABLE_FILE)
            self.Xtrain[:, out_col] = train_val
            self.Xtest[:, out_col] = test_val

        # return value unused: the helper is relied on to transform both
        # matrices in place ('log' mode)
        myclassify.np_numeric_transform(self.Xtrain, self.Xtest, [], 'log', True)

        self.fname_list = [u'RSRC_NUM_'+name+u'_LS' for name in other_names]
        self.find_list = range(len(self.fname_list)+1)

        if file_path:
            self.save_feature_set(file_path)
        
# Build the resource 'num' features (needs the global count_table), cache them
# at ORSRC_NUM_LS_FSET_FILE and show the feature names.
orsrcn_ls_fset = ORsrcNumLSFeatureSet()
%time orsrcn_ls_fset.generate_feature_set(ORSRC_NUM_LS_FSET_FILE)
print orsrcn_ls_fset.fname_list

CPU times: user 2.12 s, sys: 24.1 ms, total: 2.14 s
Wall time: 2.13 s
[u'RSRC_PER_MGR_ID_LS', u'RSRC_PER_ROLE_DEPTNAME_LS', u'RSRC_PER_ROLE_TITLE_LS', u'RSRC_PER_ROLE_FAMILY_DESC_LS', u'RSRC_PER_ROLE_FAMILY_LS', u'RSRC_PER_ROLE_ROLLUP_1_LS', u'RSRC_PER_ROLE_ROLLUP_2_LS']
CPU times: user 1.49 s, sys: 4.01 ms, total: 1.49 s
Wall time: 1.49 s
[u'RSRC_NUM_MGR_ID_LS', u'RSRC_NUM_ROLE_DEPTNAME_LS', u'RSRC_NUM_ROLE_TITLE_LS', u'RSRC_NUM_ROLE_FAMILY_DESC_LS', u'RSRC_NUM_ROLE_FAMILY_LS', u'RSRC_NUM_ROLE_ROLLUP_1_LS', u'RSRC_NUM_ROLE_ROLLUP_2_LS']


In [52]:
reload(myclassify)

# for each manager (MGR_ID): number of unique values of each remaining (non-RESOURCE) feature that appear with it
OMGR_UNUM_LS_FSET_FILE = './cache/omgrun_ls_fset.pickle'
class OMgrUNumLSFeatureSet(MyFeatureSet):
    """'num' two-degree counts of base column 1 (MGR_ID) against every later base column."""
    def generate_feature_set(self, file_path = None):
        """Fill one float column per base feature from index 2 onward.

        fname_list is assigned in one step (as the sibling count classes do),
        so generating twice on the same instance does not duplicate names.

        file_path -- when truthy, the result is handed to save_feature_set(file_path).
        """
        orig_fset = OrigFeatureSet()
        orig_fset.load_feature_set(ORIG_FSET_FILE)
        n_of = len(orig_fset.fname_list)
        n_train = orig_fset.Xtrain.shape[0]
        n_test = orig_fset.Xtest.shape[0]
        # base columns 0 (RESOURCE) and 1 (MGR_ID) get no output column
        self.Xtrain = np.zeros((n_train, n_of-2), float)
        self.Xtest = np.zeros((n_test, n_of-2), float)
        for i in xrange(2, n_of):
            self.Xtrain[:, i-2], self.Xtest[:, i-2] = \
                count_table.fset_two_degree_counts(orig_fset, 1, i, 'num', COUNT_TABLE_FILE)
        # return value unused: the helper is relied on to transform both
        # matrices in place ('log' mode)
        myclassify.np_numeric_transform(self.Xtrain, self.Xtest, [], 'log', True)
        self.fname_list = [u'MGR_UNUM_'+fname+u'_LS' for fname in orig_fset.fname_list[2:]]
        self.find_list = range(len(self.fname_list)+1)

        if file_path:
            self.save_feature_set(file_path)
        
# Build the per-manager 'num' features (needs the global count_table), cache
# them at OMGR_UNUM_LS_FSET_FILE and show the feature names.
omgrun_ls_fset = OMgrUNumLSFeatureSet()
# print omgrun_ls_fset._file_path
%time omgrun_ls_fset.generate_feature_set(OMGR_UNUM_LS_FSET_FILE)
print omgrun_ls_fset.fname_list

CPU times: user 1.38 s, sys: 4.02 ms, total: 1.38 s
Wall time: 1.38 s
[u'MGR_UNUM_ROLE_DEPTNAME_LS', u'MGR_UNUM_ROLE_TITLE_LS', u'MGR_UNUM_ROLE_FAMILY_DESC_LS', u'MGR_UNUM_ROLE_FAMILY_LS', u'MGR_UNUM_ROLE_ROLLUP_1_LS', u'MGR_UNUM_ROLE_ROLLUP_2_LS']


In [53]:
reload(myclassify)

CR_ORIGR_TCR_LS_FSET_FILE = './cache/crorigr_tcr_ls_fset.pickle'
class CROrigRTCRLSFeatureSet(MyFeatureSet):
    """Concatenation of seven cached feature sets (rare-combined categoricals plus count features)."""
    def generate_feature_set(self, file_path = None):
        """Load each component from its cache file and concatenate them.

        file_path -- when truthy, the result is handed to save_feature_set(file_path).
        """
        # (feature-set class, cache file) pairs, in the order they are passed
        # to concat_feature_set. ODeptUNumLSFeatureSet / ORoleUNumLSFeatureSet
        # were commented out in the original cell and remain excluded.
        components = [
            (CROrigRFeatureSet, CR_ORIGR_FSET_FILE),
            (CROrigTupleFeatureSet, CR_ORIG_TUPLE_FSET_FILE),
            (OrigCntLSFeatureSet, ORIG_CNT_LS_FSET_FILE),
            (OrigTupleCntLSFeatureSet, ORIG_TUPLE_CNT_LS_FSET_FILE),
            (ORsrcNumLSFeatureSet, ORSRC_NUM_LS_FSET_FILE),
            (ORsrcPerLSFeatureSet, ORSRC_PER_LS_FSET_FILE),
            (OMgrUNumLSFeatureSet, OMGR_UNUM_LS_FSET_FILE),
        ]

        fset_list = []
        for fset_cls, cache_file in components:
            part = fset_cls()
            part.load_feature_set(cache_file)
            fset_list.append(part)

        merged = myclassify.concat_feature_set(fset_list)
        self.fname_list, self.find_list, self.Xtrain, self.Xtest = merged

        if file_path:
            self.save_feature_set(file_path)

# Assemble the combined (non-one-hot) feature set from the component caches,
# cache it at CR_ORIGR_TCR_LS_FSET_FILE and show the feature names.
# The xgb cell below trains on this set.
crorigr_tcr_ls_fset = CROrigRTCRLSFeatureSet()
%time crorigr_tcr_ls_fset.generate_feature_set(CR_ORIGR_TCR_LS_FSET_FILE)
print crorigr_tcr_ls_fset.fname_list

CPU times: user 94 ms, sys: 264 ms, total: 358 ms
Wall time: 912 ms
[u'CR_RESOURCE', u'CR_MGR_ID', u'CR_ROLE_DEPTNAME', u'CR_ROLE_TITLE', u'CR_ROLE_FAMILY_DESC', u'CR_ROLE_FAMILY', u'CR_RESOURCE_AND_MGR_ID', u'CR_RESOURCE_AND_ROLE_DEPTNAME', u'CR_RESOURCE_AND_ROLE_TITLE', u'CR_RESOURCE_AND_ROLE_FAMILY_DESC', u'CR_RESOURCE_AND_ROLE_FAMILY', u'CR_RESOURCE_AND_ROLE_ROLLUP_1', u'CR_RESOURCE_AND_ROLE_ROLLUP_2', u'CR_MGR_ID_AND_ROLE_DEPTNAME', u'CR_MGR_ID_AND_ROLE_TITLE', u'CR_MGR_ID_AND_ROLE_FAMILY_DESC', u'CR_MGR_ID_AND_ROLE_FAMILY', u'CR_MGR_ID_AND_ROLE_ROLLUP_1', u'CR_MGR_ID_AND_ROLE_ROLLUP_2', u'CR_ROLE_DEPTNAME_AND_ROLE_TITLE', u'CR_ROLE_DEPTNAME_AND_ROLE_FAMILY_DESC', u'CR_ROLE_DEPTNAME_AND_ROLE_FAMILY', u'CR_ROLE_DEPTNAME_AND_ROLE_ROLLUP_1', u'CR_ROLE_DEPTNAME_AND_ROLE_ROLLUP_2', u'CR_ROLE_TITLE_AND_ROLE_FAMILY_DESC', u'CR_ROLE_TITLE_AND_ROLE_ROLLUP_1', u'CR_ROLE_TITLE_AND_ROLE_ROLLUP_2', u'CR_ROLE_FAMILY_DESC_AND_ROLE_FAMILY', u'CR_ROLE_FAMILY_DESC_AND_ROLE_ROLLUP_1', u'CR_ROLE_FAMI

In [55]:
reload(myclassify)

OHCR_ORIGR_TCR_LS_FSET_FILE = './cache/ohcrorigr_tcr_ls_fset.pickle'
class OHCROrigRTCRLSFeatureSet(MyFeatureSet):
    """Concatenation of seven cached feature sets (one-hot categoricals plus count features)."""
    def generate_feature_set(self, file_path = None):
        """Load each component from its cache file and concatenate them.

        file_path -- when truthy, the result is handed to save_feature_set(file_path).
        """
        # (feature-set class, cache file) pairs, in the order they are passed
        # to concat_feature_set. ODeptUNumLSFeatureSet / ORoleUNumLSFeatureSet
        # were commented out in the original cell and remain excluded.
        components = [
            (OHCROrigRFeatureSet, OHCR_ORIGR_FSET_FILE),
            (OHCROrigTupleFeatureSet, OHCR_ORIG_TUPLE_FSET_FILE),
            (OrigCntLSFeatureSet, ORIG_CNT_LS_FSET_FILE),
            (OrigTupleCntLSFeatureSet, ORIG_TUPLE_CNT_LS_FSET_FILE),
            (ORsrcNumLSFeatureSet, ORSRC_NUM_LS_FSET_FILE),
            (ORsrcPerLSFeatureSet, ORSRC_PER_LS_FSET_FILE),
            (OMgrUNumLSFeatureSet, OMGR_UNUM_LS_FSET_FILE),
        ]

        fset_list = []
        for fset_cls, cache_file in components:
            part = fset_cls()
            part.load_feature_set(cache_file)
            fset_list.append(part)

        merged = myclassify.concat_feature_set(fset_list)
        self.fname_list, self.find_list, self.Xtrain, self.Xtest = merged

        if file_path:
            self.save_feature_set(file_path)

# Assemble the combined one-hot feature set from the component caches, cache it
# at OHCR_ORIGR_TCR_LS_FSET_FILE and show the feature names.
# The logistic-regression cell below trains on this set.
ohcrorigr_tcr_ls_fset = OHCROrigRTCRLSFeatureSet()
%time ohcrorigr_tcr_ls_fset.generate_feature_set(OHCR_ORIGR_TCR_LS_FSET_FILE)
print ohcrorigr_tcr_ls_fset.fname_list
# print ohcrorigr_tcr_ls_fset.find_list

CPU times: user 851 ms, sys: 573 ms, total: 1.42 s
Wall time: 2.73 s
[u'OH_CR_RESOURCE', u'OH_CR_MGR_ID', u'OH_CR_ROLE_DEPTNAME', u'OH_CR_ROLE_TITLE', u'OH_CR_ROLE_FAMILY_DESC', u'OH_CR_ROLE_FAMILY', u'OH_CR_RESOURCE_AND_MGR_ID', u'OH_CR_RESOURCE_AND_ROLE_DEPTNAME', u'OH_CR_RESOURCE_AND_ROLE_TITLE', u'OH_CR_RESOURCE_AND_ROLE_FAMILY_DESC', u'OH_CR_RESOURCE_AND_ROLE_FAMILY', u'OH_CR_RESOURCE_AND_ROLE_ROLLUP_1', u'OH_CR_RESOURCE_AND_ROLE_ROLLUP_2', u'OH_CR_MGR_ID_AND_ROLE_DEPTNAME', u'OH_CR_MGR_ID_AND_ROLE_TITLE', u'OH_CR_MGR_ID_AND_ROLE_FAMILY_DESC', u'OH_CR_MGR_ID_AND_ROLE_FAMILY', u'OH_CR_MGR_ID_AND_ROLE_ROLLUP_1', u'OH_CR_MGR_ID_AND_ROLE_ROLLUP_2', u'OH_CR_ROLE_DEPTNAME_AND_ROLE_TITLE', u'OH_CR_ROLE_DEPTNAME_AND_ROLE_FAMILY_DESC', u'OH_CR_ROLE_DEPTNAME_AND_ROLE_FAMILY', u'OH_CR_ROLE_DEPTNAME_AND_ROLE_ROLLUP_1', u'OH_CR_ROLE_DEPTNAME_AND_ROLE_ROLLUP_2', u'OH_CR_ROLE_TITLE_AND_ROLE_FAMILY_DESC', u'OH_CR_ROLE_TITLE_AND_ROLE_ROLLUP_1', u'OH_CR_ROLE_TITLE_AND_ROLE_ROLLUP_2', u'OH_CR_ROLE_F

In [56]:
# generate ytrain and idtest
# The raw CSVs are re-read here because the feature-set classes keep only the
# feature columns; the target and the submission ids come from the raw files.
df_train = pd.read_csv('./train.csv')
df_test = pd.read_csv('./test.csv')

# training target (ACTION column) and test-row identifiers (id column)
ytrain = df_train[u'ACTION'].values
idtest = df_test[u'id'].values


In [60]:
# generate xgb prediction using crorigr_tcr_ls_fset
# (The original first line was un-commented prose, a SyntaxError, and named a
# feature set that is not the one used below.)
# NOTE(review): both 'num_round' and 'n_estimators' are passed and prediction
# uses ntree_limit=800; how MyXGBoost interprets these is not visible here.
myxgb_params = {'objective': 'binary:logistic', 'subsample': .9, 'nthread': 4, 'seed': SEED, 'num_round':1000,
                   'learning_rate': 0.03, 'n_estimators': 1000, 'colsample_bylevel':0.85, 
                   'max_depth': 20,'gamma': 0.6, 'colsample_bytree':0.85, 'min_child_weight':0.,
                      'lambda': 0.8, 'alpha': 0}

myxgb = myclassify.MyXGBoost(myxgb_params)

# train on the combined (non-one-hot) feature set
myxgb.fit(crorigr_tcr_ls_fset.Xtrain, ytrain)

myxgb_ypred = myxgb.predict_proba(crorigr_tcr_ls_fset.Xtest, {'ntree_limit':800})

# write the submission with columns in the order id, ACTION
submission = pd.DataFrame({"id":idtest, "ACTION":myxgb_ypred})
submission = submission[['id', 'ACTION']]
submission.to_csv("xgb_submission.csv", index=False)

In [58]:
# generate logistic regression prediction using ohcrorigr_tcr_ls_fset
mylr_params = {'C': 2., 'n_jobs':-1, 'penalty':'l2', 
               'solver':'liblinear', 'max_iter':1000 , 'tol':1e-10, 'random_state':SEED, 'verbose':0}

mylr = myclassify.MyLogisticReg(mylr_params)

# train on the combined one-hot feature set
mylr.fit(ohcrorigr_tcr_ls_fset.Xtrain, ytrain)

mylr_ypred = mylr.predict_proba(ohcrorigr_tcr_ls_fset.Xtest)

# write the submission with columns in the order id, ACTION
# (the `submission` name from the xgb cell is rebound here)
submission = pd.DataFrame({"id":idtest, "ACTION":mylr_ypred})
submission = submission[['id', 'ACTION']]
submission.to_csv("lr_submission.csv", index=False)