In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
%matplotlib inline

## Feature columns(数据特征的确定)

In [93]:
# all feature colmuns 
column_names = ['age','class_worker', 'det_ind_code', 'det_occ_code', 'education', 'wage_per_hour', 'hs_college',
                    'marital_stat', 'major_ind_code', 'major_occ_code', 'race', 'hisp_origin', 'sex', 'union_member',
                    'unemp_reason', 'full_or_part_emp', 'capital_gains', 'capital_losses', 'stock_dividends',
                    'tax_filer_stat', 'region_prev_res', 'state_prev_res', 'det_hh_fam_stat', 'det_hh_summ',
                    'instance_weight', 'mig_chg_msa', 'mig_chg_reg', 'mig_move_reg', 'mig_same', 'mig_prev_sunbelt',
                    'num_emp', 'fam_under_18', 'country_father', 'country_mother', 'country_self', 'citizenship',
                    'own_or_self', 'vet_question', 'vet_benefits', 'weeks_worked', 'year', 'income_50k']

In [94]:
# categorical features
categorical_columns = ['class_worker', 'det_ind_code', 'det_occ_code', 'education', 'hs_college', 'major_ind_code',
                           'major_occ_code', 'race', 'hisp_origin', 'sex', 'union_member', 'unemp_reason',
                           'full_or_part_emp', 'tax_filer_stat', 'region_prev_res', 'state_prev_res', 'det_hh_fam_stat',
                           'det_hh_summ', 'mig_chg_msa', 'mig_chg_reg', 'mig_move_reg', 'mig_same', 'mig_prev_sunbelt',
                           'fam_under_18', 'country_father', 'country_mother', 'country_self', 'citizenship',
                           'vet_question']

In [95]:
# label
label_columns=['income_50k', 'marital_stat']

In [96]:
# numerical features
numeric_cols = list(set(column_names) - set(categorical_columns) - set(label_columns))

In [97]:
train_path = '../data/census/census-income.data.gz'
test_path = '../data/census/census-income.test.gz'

In [98]:
train_df = pd.read_csv(
        train_path,
        delimiter=',',
        header=None,
        index_col=None,
        names=column_names
    )

In [99]:
train_df.head()

Unnamed: 0,age,class_worker,det_ind_code,det_occ_code,education,wage_per_hour,hs_college,marital_stat,major_ind_code,major_occ_code,...,country_father,country_mother,country_self,citizenship,own_or_self,vet_question,vet_benefits,weeks_worked,year,income_50k
0,73,Not in universe,0,0,High school graduate,0,Not in universe,Widowed,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,0,95,- 50000.
1,58,Self-employed-not incorporated,4,34,Some college but no degree,0,Not in universe,Divorced,Construction,Precision production craft & repair,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,2,52,94,- 50000.
2,18,Not in universe,0,0,10th grade,0,High school,Never married,Not in universe or children,Not in universe,...,Vietnam,Vietnam,Vietnam,Foreign born- Not a citizen of U S,0,Not in universe,2,0,95,- 50000.
3,9,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000.
4,10,Not in universe,0,0,Children,0,Not in universe,Never married,Not in universe or children,Not in universe,...,United-States,United-States,United-States,Native- Born in the United States,0,Not in universe,0,0,94,- 50000.


In [100]:
train_df['income_50k'].value_counts()

 - 50000.    187141
 50000+.      12382
Name: income_50k, dtype: int64

In [101]:
train_df['label_income'] = (train_df['income_50k'] == ' 50000+.').astype(float)

In [102]:
train_df['label_income'].value_counts()

0.0    187141
1.0     12382
Name: label_income, dtype: int64

In [103]:
train_df.class_worker.value_counts()

 Not in universe                   100245
 Private                            72028
 Self-employed-not incorporated      8445
 Local government                    7784
 State government                    4227
 Self-employed-incorporated          3265
 Federal government                  2925
 Never worked                         439
 Without pay                          165
Name: class_worker, dtype: int64

### Label Encoding

In [14]:
from collections import Counter

In [15]:
class_worker_dict = Counter(train_df.class_worker)

In [16]:
class_worker_dict

Counter({' Not in universe': 100245,
         ' Self-employed-not incorporated': 8445,
         ' Private': 72028,
         ' Local government': 7784,
         ' Federal government': 2925,
         ' Self-employed-incorporated': 3265,
         ' State government': 4227,
         ' Never worked': 439,
         ' Without pay': 165})

In [18]:
class_worker_order_dict = {cla:i for i,cla in enumerate(class_worker_dict)}

In [19]:
train_df['class_worker_label_code'] = train_df.class_worker.apply(lambda x:class_worker_order_dict[x])

In [20]:
train_df['class_worker_label_code'].value_counts()

0    100245
2     72028
1      8445
3      7784
6      4227
5      3265
4      2925
7       439
8       165
Name: class_worker_label_code, dtype: int64

### Frequency Encoding

In [42]:
train_df['class_worker_frequency_code'] = train_df.class_worker.apply(lambda x:class_worker_dict[x])

In [43]:
train_df['class_worker_frequency_code'].value_counts()

100245    100245
72028      72028
8445        8445
7784        7784
4227        4227
3265        3265
2925        2925
439          439
165          165
Name: class_worker_frequency_code, dtype: int64

### OneHot Encoding

In [21]:
len(class_worker_order_dict)

9

In [22]:
from sklearn.preprocessing import  OneHotEncoder

In [23]:
enc = OneHotEncoder(n_values=9)

In [29]:
enc.fit(train_df[['class_worker_label_code']])



OneHotEncoder(categorical_features=None, categories=None, drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='error', n_values=9,
              sparse=True)

In [31]:
enc.transform(train_df[['class_worker_label_code']]).toarray()[:10]

array([[1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0.]])

### Target Encoding 

In [112]:
class_worker_pd =train_df[['class_worker','label_income']].groupby('class_worker').agg({'label_income':'mean'}
                                                                     )

In [113]:
class_worker_pd

Unnamed: 0_level_0,label_income
class_worker,Unnamed: 1_level_1
Federal government,0.204103
Local government,0.108813
Never worked,0.004556
Not in universe,0.009018
Private,0.101655
Self-employed-incorporated,0.34732
Self-employed-not incorporated,0.12907
State government,0.114739
Without pay,0.006061


In [116]:
class_worker_ctr = class_worker_pd.label_income.to_dict()

In [117]:
class_worker_ctr

{' Federal government': 0.2041025641025641,
 ' Local government': 0.10881294964028777,
 ' Never worked': 0.004555808656036446,
 ' Not in universe': 0.009017906129981546,
 ' Private': 0.10165491197867496,
 ' Self-employed-incorporated': 0.3473200612557427,
 ' Self-employed-not incorporated': 0.1290704558910598,
 ' State government': 0.11473858528507215,
 ' Without pay': 0.006060606060606061}

In [118]:
train_df['class_worker_target_code'] = train_df.class_worker.apply(lambda x:class_worker_ctr[x])

In [119]:
train_df.class_worker_target_code.head()

0    0.009018
1    0.129070
2    0.009018
3    0.009018
4    0.009018
Name: class_worker_target_code, dtype: float64

In [104]:
from category_encoders.target_encoder import TargetEncoder

In [108]:
train_df['class_worker_target_code2'] = train_df['class_worker']

In [109]:
encoder = TargetEncoder(cols=['class_worker_target_code2'], 
                        handle_unknown='value',  
                        handle_missing='value').fit(train_df,train_df['label_income'])



In [110]:
train_df = encoder.transform(train_df)

In [120]:
train_df[['class_worker','class_worker_target_code2','class_worker_target_code']].head()

Unnamed: 0,class_worker,class_worker_target_code2,class_worker_target_code
0,Not in universe,0.009018,0.009018
1,Self-employed-not incorporated,0.12907,0.12907
2,Not in universe,0.009018,0.009018
3,Not in universe,0.009018,0.009018
4,Not in universe,0.009018,0.009018


In [121]:
import category_encoders as ce

In [122]:
train_df['class_worker_binary_code'] = train_df['class_worker']

In [123]:
encoder = ce.BinaryEncoder(cols=['class_worker_binary_code']).fit(train_df)

In [124]:
train_df = encoder.transform(train_df)

In [129]:
train_df[['class_worker_binary_code_0','class_worker_binary_code_1','class_worker_binary_code_2','class_worker_binary_code_3']]


Unnamed: 0,class_worker_binary_code_0,class_worker_binary_code_1,class_worker_binary_code_2,class_worker_binary_code_3
0,0,0,0,1
1,0,0,1,0
2,0,0,0,1
3,0,0,0,1
4,0,0,0,1
...,...,...,...,...
199518,0,0,0,1
199519,0,1,1,0
199520,0,0,0,1
199521,0,0,0,1


### CatBoost Encoding 

In [130]:
import category_encoders as encoders

In [134]:
train_df['class_worker_catboost_code'] = train_df['class_worker']
enc = encoders.CatBoostEncoder()

In [135]:

train_df['class_worker_catboost_code'] = enc.fit_transform(train_df['class_worker_catboost_code'], train_df['label_income'])

In [137]:
train_df['class_worker_catboost_code'].head()

0    0.062058
1    0.062058
2    0.031029
3    0.020686
4    0.015515
Name: class_worker_catboost_code, dtype: float64

### WOE Encoding 

In [138]:
enc = encoders.WOEEncoder()

In [139]:
train_df['class_worker_WOE_code'] = train_df['class_worker']
enc = encoders.WOEEncoder()

In [140]:
train_df['class_worker_WOE_code'] = enc.fit_transform(train_df['class_worker_WOE_code'], train_df['label_income'])

In [141]:
train_df['class_worker_WOE_code'].head()

0   -1.982921
1    0.807046
2   -1.982921
3   -1.982921
4   -1.982921
Name: class_worker_WOE_code, dtype: float64

### Helmert Encoding 

In [142]:
from category_encoders.helmert import HelmertEncoder

In [144]:
train_df['class_worker_helm_code'] = train_df['class_worker']

In [143]:
enc = HelmertEncoder()

In [145]:
enc.fit_transform(train_df['class_worker_helm_code'],train_df['label_income'])



Unnamed: 0,intercept,class_worker_helm_code_0,class_worker_helm_code_1,class_worker_helm_code_2,class_worker_helm_code_3,class_worker_helm_code_4,class_worker_helm_code_5,class_worker_helm_code_6,class_worker_helm_code_7
0,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,1,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...
199518,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
199519,1,0.0,0.0,0.0,0.0,5.0,-1.0,-1.0,-1.0
199520,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
199521,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


### MEstimate Encoding 

In [146]:
from category_encoders.m_estimate import MEstimateEncoder

In [147]:
train_df['class_worker_mest_code'] = train_df['class_worker']

In [148]:
enc = MEstimateEncoder()

In [149]:
enc.fit_transform(train_df['class_worker_mest_code'],train_df['label_income'])

Unnamed: 0,class_worker_mest_code
0,0.009018
1,0.129063
2,0.009018
3,0.009018
4,0.009018
...,...
199518,0.009018
199519,0.347233
199520,0.009018
199521,0.009018


In [None]:
### Sum Encoding 

In [150]:
from category_encoders.sum_coding import SumEncoder

In [151]:
train_df['class_worker_sum_code'] = train_df['class_worker']

In [152]:
enc = MEstimateEncoder()

In [153]:
enc.fit_transform(train_df['class_worker_sum_code'],train_df['label_income'])

Unnamed: 0,class_worker_sum_code
0,0.009018
1,0.129063
2,0.009018
3,0.009018
4,0.009018
...,...
199518,0.009018
199519,0.347233
199520,0.009018
199521,0.009018


### LeaveOneOut Encoding 

In [154]:
from category_encoders.leave_one_out import LeaveOneOutEncoder

In [155]:
train_df['class_worker_leaveone_code'] = train_df['class_worker']

In [156]:
enc = LeaveOneOutEncoder()

In [157]:
enc.fit_transform(train_df['class_worker_leaveone_code'],train_df['label_income'])

Unnamed: 0,class_worker_leaveone_code
0,0.009018
1,0.129086
2,0.009018
3,0.009018
4,0.009018
...,...
199518,0.009018
199519,0.347426
199520,0.009018
199521,0.009018


### BetaTarget Encoding 

In [161]:

class BetaTargetEncoder(object):
    def __init__(self, col):
        self.group = col
        self.stats = None
    # get counts from df
    def fit(self, df, target_col):
        # 先验均值
        self.prior_mean = np.mean(df[target_col]) 
        stats           = df[[target_col, self.group]].groupby(self.group)
        # count和sum
        stats           = stats.agg(['sum', 'count'])[target_col]    
        stats.rename(columns={'sum': 'n', 'count': 'N'}, inplace=True)
        stats.reset_index(level=0, inplace=True)           
        self.stats      = stats
        
    # extract posterior statistics
    def transform(self, df, stat_type, N_min=1):
        
        df_stats = pd.merge(df[[self.group]], self.stats, how='left')
        n        = df_stats['n'].copy()
        N        = df_stats['N'].copy()
        
        # fill in missing
        nan_indexs    = np.isnan(n)
        n[nan_indexs] = self.prior_mean
        N[nan_indexs] = 1.0
        
        # prior parameters
        N_prior     = np.maximum(N_min-N, 0)
        alpha_prior = self.prior_mean*N_prior
        beta_prior  = (1-self.prior_mean)*N_prior
        
        # posterior parameters
        alpha       =  alpha_prior + n
        beta        =  beta_prior  + N-n
        
        # calculate statistics
        if stat_type=='mean':
            num = alpha
            dem = alpha+beta
                    
        elif stat_type=='mode':
            num = alpha-1
            dem = alpha+beta-2
            
        elif stat_type=='median':
            num = alpha-1/3
            dem = alpha+beta-2/3
        
        elif stat_type=='var':
            num = alpha*beta
            dem = (alpha+beta)**2*(alpha+beta+1)
                    
        elif stat_type=='skewness':
            num = 2*(beta-alpha)*np.sqrt(alpha+beta+1)
            dem = (alpha+beta+2)*np.sqrt(alpha*beta)
 
        elif stat_type=='kurtosis':
            num = 6*(alpha-beta)**2*(alpha+beta+1) - alpha*beta*(alpha+beta+2)
            dem = alpha*beta*(alpha+beta+2)*(alpha+beta+3)
            
        # replace missing
        value = num/dem
        value[np.isnan(value)] = np.nanmedian(value)
        return value


In [162]:
enc = BetaTargetEncoder(col='class_worker')

In [163]:
enc.fit(train_df,'label_income')

In [164]:
enc.transform(train_df,'mean')

0         0.009018
1         0.129070
2         0.009018
3         0.009018
4         0.009018
            ...   
199518    0.009018
199519    0.347320
199520    0.009018
199521    0.009018
199522    0.101655
Length: 199523, dtype: float64

In [165]:
enc.transform(train_df,'var')

0         8.914653e-08
1         1.330941e-05
2         8.914653e-08
3         8.914653e-08
4         8.914653e-08
              ...     
199518    8.914653e-08
199519    6.940871e-05
199520    8.914653e-08
199521    8.914653e-08
199522    1.267839e-06
Length: 199523, dtype: float64

In [166]:
enc.transform(train_df,'skewness')

0         0.065615
1         0.048147
2         0.065615
3         0.065615
4         0.065615
            ...   
199518    0.065615
199519    0.022438
199520    0.065615
199521    0.065615
199522    0.019646
Length: 199523, dtype: float64

* 对于取值数量很少（<10）的类别型特征，相应的各取值下的样本数量也比较多，可以直接Onehot编码。
* 对于取值数量比较多（10到几百），这时onehot从效率或者效果，都不及lightgbm梯度编码或catboost目标编码，而且直接使用也很方便。（需要注意的是，个人实践中这两种方法在很多取值的类别特征，还是比较容易过拟合。这时，类别值先做下经验的合并或者尝试剔除某些类别特征后，模型效果反而会更好）
* 当几百上千的类别取值，可以先onehot后（高维稀疏），借助神经网络模型做低维稠密表示。
* 对于无序的离散特征，实战中使用 OneHot, Hashing, LeaveOneOut, and Target encoding 方法效果较好，但是使用OneHot时要避免高基类别的特征以及基于决策树的模型
* 对于回归问题而言，Target 与 LeaveOneOut 方法可能不会有比较好的效果。
* LeaveOneOut、 WeightOfEvidence、 James-Stein、M-estimator 适合用来处理高基数特征。
* Helmert、 Sum、 Backward Difference、 Polynomial 在机器学习问题里的效果往往不是很好(过拟合的原因)
* 使用Beta Target Encoding相较于直接使用LightGBM建模的效果可以得到大幅提升