##  reference:
* http://pbpython.com/categorical-encoding.html
* http://scikit-learn.org/stable/modules/preprocessing.html#encoding-categorical-features
* http://contrib.scikit-learn.org/categorical-encoding/
* https://medium.com/data-design/visiting-categorical-features-and-encoding-in-decision-trees-53400fa65931
* https://stats.idre.ucla.edu/spss/webbooks/reg/chapter5/regression-with-spsschapter-5-additional-coding-systems-for-categorical-variables-in-regressionanalysis/
* http://www.statsmodels.org/dev/contrasts.html

## pandas

#### 1. 数据读取：

In [2]:
import pandas as pd
import numpy as np

# Column names to assign on load, since the data file has no header row
headers = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
           "num_doors", "body_style", "drive_wheels", "engine_location",
           "wheel_base", "length", "width", "height", "curb_weight",
           "engine_type", "num_cylinders", "engine_size", "fuel_system",
           "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
           "city_mpg", "highway_mpg", "price"]

# Read in the CSV file and convert "?" to NaN
# NOTE(review): this host appears to be a mirror of the UCI Machine Learning
# Repository -- confirm it is still reachable before relying on this cell.
df = pd.read_csv("http://mlr.cs.umass.edu/ml/machine-learning-databases/autos/imports-85.data",
                  header=None, names=headers, na_values="?" )
# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


#### 2. 打印数据类型：

In [4]:
# Show the dtype pandas inferred for every column
df.dtypes

symboling              int64
normalized_losses    float64
make                  object
fuel_type             object
aspiration            object
num_doors             object
body_style            object
drive_wheels          object
engine_location       object
wheel_base           float64
length               float64
width                float64
height               float64
curb_weight            int64
engine_type           object
num_cylinders         object
engine_size            int64
fuel_system           object
bore                 float64
stroke               float64
compression_ratio    float64
horsepower           float64
peak_rpm             float64
city_mpg               int64
highway_mpg            int64
price                float64
dtype: object

#### 3. 选取非数字类型特征

In [5]:
# Keep only the object-dtype columns; .copy() makes obj_df independent of df
# so that later edits to obj_df leave df untouched
obj_df = df.select_dtypes(include=['object']).copy()
obj_df.head()

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
1,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
2,alfa-romero,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi
3,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi
4,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi


#### 4. 空数据特征提取和观察

In [6]:
# Show every row that has a missing value in at least one column
obj_df[obj_df.isnull().any(axis=1)]

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
27,dodge,gas,turbo,,sedan,fwd,front,ohc,four,mpfi
63,mazda,diesel,std,,sedan,fwd,front,ohc,four,idi


#### 5. 判断含有空数据的特征列的数据分布

###### * 使用最常见值填充

In [7]:
# Count how often each value occurs in num_doors
obj_df['num_doors'].value_counts()

four    114
two      89
Name: num_doors, dtype: int64

###### * 均值填充／中位数填充

In [39]:
# Mean imputation. sklearn.preprocessing.Imputer was deprecated in
# scikit-learn 0.20 and removed in 0.22; SimpleImputer is its replacement.
# It always imputes column by column, so the old axis argument is gone.
from sklearn.impute import SimpleImputer

# Learn the per-column means from the training rows, skipping NaN entries
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit([[1, 2], [np.nan, 3], [7, 6]])

# Every NaN in X is replaced by the mean learned for its column
X = [[np.nan, 2], [6, np.nan], [7, 6]]
print(imp.transform(X))

[[4.         2.        ]
 [6.         3.66666667]
 [7.         6.        ]]


###### * 稀疏矩阵填充

In [41]:
import scipy.sparse as sp
# sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and
# removed in 0.22; SimpleImputer replaces it and always works column-wise,
# so the old axis argument is gone.
from sklearn.impute import SimpleImputer

X = sp.csc_matrix([[1, 2], [0, 3], [7, 6]])
imp = SimpleImputer(missing_values=0, strategy='mean')
# SimpleImputer refuses sparse input when missing_values == 0, because every
# implicit zero of the sparse format would count as missing. Convert to a
# dense array so that 0 can still act as the missing-value marker.
imp.fit(X.toarray())

X_test = sp.csc_matrix([[0, 2], [6, 0], [7, 6]])
print(imp.transform(X_test.toarray()))

[[4.         2.        ]
 [6.         3.66666667]
 [7.         6.        ]]


#### 6. 进行空数据补充（方法有很多）

In [9]:
# Fill the missing num_doors entries with 'four', the most frequent value
# in that column
obj_df = obj_df.fillna({'num_doors':'four'})

#### 7. 转换
##### a. 数据查找并替换

* 查看数据分布

In [10]:
# Count how often each value occurs in num_cylinders
obj_df["num_cylinders"].value_counts()

four      159
six        24
five       11
eight       5
two         4
twelve      1
three       1
Name: num_cylinders, dtype: int64

* 构建替换用词典

In [11]:
# Per-column lookup tables that turn spelled-out counts into integers.
# Outer keys are column names; inner keys are the words to replace.
cleanup_nums = {
    "num_doors": dict(four=4, two=2),
    "num_cylinders": dict(
        four=4,
        six=6,
        five=5,
        eight=8,
        two=2,
        twelve=12,
        three=3,
    ),
}

In [12]:
# Apply the per-column mappings in cleanup_nums, modifying obj_df in place
obj_df.replace(cleanup_nums, inplace=True)
obj_df.head()

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,alfa-romero,gas,std,2,convertible,rwd,front,dohc,4,mpfi
1,alfa-romero,gas,std,2,convertible,rwd,front,dohc,4,mpfi
2,alfa-romero,gas,std,2,hatchback,rwd,front,ohcv,6,mpfi
3,audi,gas,std,4,sedan,fwd,front,ohc,4,mpfi
4,audi,gas,std,4,sedan,4wd,front,ohc,5,mpfi


##### b. label encoding (缺点：直接用number sequence进行替代不能反映真实的特征)

In [13]:
# Convert body_style to the pandas category dtype, then list the dtypes
# to confirm the change
obj_df["body_style"] = obj_df["body_style"].astype('category')
obj_df.dtypes

make                 object
fuel_type            object
aspiration           object
num_doors             int64
body_style         category
drive_wheels         object
engine_location      object
engine_type          object
num_cylinders         int64
fuel_system          object
dtype: object

In [14]:
# Store the integer code of each body_style category in a new column
obj_df["body_style_cat"] = obj_df["body_style"].cat.codes
obj_df.head()

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system,body_style_cat
0,alfa-romero,gas,std,2,convertible,rwd,front,dohc,4,mpfi,0
1,alfa-romero,gas,std,2,convertible,rwd,front,dohc,4,mpfi,0
2,alfa-romero,gas,std,2,hatchback,rwd,front,ohcv,6,mpfi,2
3,audi,gas,std,4,sedan,fwd,front,ohc,4,mpfi,3
4,audi,gas,std,4,sedan,4wd,front,ohc,5,mpfi,3


##### c. one hot encoding（缺点： 在被转换特征维度特别大的情况下会产生数据稀疏问题）

In [15]:
# One-hot encode body_style and drive_wheels; the new columns are prefixed
# with "body" and "drive". The result is only displayed, not assigned back
# to obj_df.
pd.get_dummies(obj_df, columns=["body_style", "drive_wheels"], prefix=["body", "drive"]).head()

Unnamed: 0,make,fuel_type,aspiration,num_doors,engine_location,engine_type,num_cylinders,fuel_system,body_style_cat,body_convertible,body_hardtop,body_hatchback,body_sedan,body_wagon,drive_4wd,drive_fwd,drive_rwd
0,alfa-romero,gas,std,2,front,dohc,4,mpfi,0,1,0,0,0,0,0,0,1
1,alfa-romero,gas,std,2,front,dohc,4,mpfi,0,1,0,0,0,0,0,0,1
2,alfa-romero,gas,std,2,front,ohcv,6,mpfi,2,0,0,1,0,0,0,0,1
3,audi,gas,std,4,front,ohc,4,mpfi,3,0,0,0,1,0,0,1,0
4,audi,gas,std,4,front,ohc,5,mpfi,3,0,0,0,1,0,1,0,0


##### d. Binary encoding (Hybrid)

In [18]:
# Count how often each value occurs in engine_type
obj_df["engine_type"].value_counts()

ohc      148
ohcf      15
ohcv      13
dohc      12
l         12
rotor      4
dohcv      1
Name: engine_type, dtype: int64

In [21]:
# Flag rows whose engine_type contains "ohc": 1 where it does, 0 elsewhere
obj_df["OHC_Code"] = np.where(obj_df["engine_type"].str.contains("ohc"), 1, 0)

In [22]:
# Show the new flag next to the columns it relates to
obj_df[["make", "engine_type", "OHC_Code"]].head()

Unnamed: 0,make,engine_type,OHC_Code
0,alfa-romero,dohc,1
1,alfa-romero,dohc,1
2,alfa-romero,ohcv,1
3,audi,ohc,1
4,audi,ohc,1


## sklearn

##### a. label encoding

In [25]:
from sklearn.preprocessing import LabelEncoder

# Fit a LabelEncoder on make and store the resulting integer codes in a
# new column, then show the original values beside their codes
lb_make = LabelEncoder()
obj_df["make_code"] = lb_make.fit_transform(obj_df["make"])
obj_df[["make", "make_code"]].head(11)

Unnamed: 0,make,make_code
0,alfa-romero,0
1,alfa-romero,0
2,alfa-romero,0
3,audi,1
4,audi,1
5,audi,1
6,audi,1
7,audi,1
8,audi,1
9,audi,1


##### b. Label Binarizer

In [27]:
from sklearn.preprocessing import LabelBinarizer

# Binarize body_style into 0/1 indicator columns, then wrap the array in a
# DataFrame whose columns are named after the classes the binarizer learned
lb_style = LabelBinarizer()
lb_results = lb_style.fit_transform(obj_df["body_style"])
pd.DataFrame(lb_results, columns=lb_style.classes_).head()

Unnamed: 0,convertible,hardtop,hatchback,sedan,wagon
0,1,0,0,0,0
1,1,0,0,0,0
2,0,0,1,0,0
3,0,0,0,1,0
4,0,0,0,1,0


###### c. one-hot

In [46]:
from sklearn import preprocessing

# n_values was deprecated in scikit-learn 0.20 and removed in 0.22. The
# categories argument replaces it by listing the allowed values of each
# feature: 2 values for feature 0, 3 for feature 1 and 4 for feature 2.
enc = preprocessing.OneHotEncoder(categories=[[0, 1], [0, 1, 2], [0, 1, 2, 3]])
# Note that there are missing categorical values for the 2nd and 3rd
# features
enc.fit([[1, 2, 3], [0, 2, 0]])

# The result has 2 + 3 + 4 = 9 columns, including columns for the values
# that never appeared in the fitted data.
enc.transform([[1, 0, 0]]).toarray()

array([[0., 1., 1., 0., 0., 1., 0., 0., 0.]])

###### d. Custom transformers

In [48]:
import numpy as np
from sklearn.preprocessing import FunctionTransformer
# Wrap np.log1p as a transformer and apply it to a small example array
transformer = FunctionTransformer(np.log1p)
X = np.array([[0, 1], [2, 3]])
transformer.transform(X)

array([[0.        , 0.69314718],
       [1.09861229, 1.38629436]])

## Category_encoders

##### a. backward difference encoder
    原理：不断地将每个level的值与它前一个level的值进行比较，由此得到每次比较对应的feature
    这种方法适用于nominal(均值和方差没有意义，但是mode有意义)或者ordinal特征的数据（有顺序或者无顺序的都可被适用）
    nominal例子：
    * Gender (Male, Female, Transgender).
    * Eye color (Blue, Green, Brown, Hazel).
    * Type of house (Bungalow, Duplex, Ranch).
    * Type of pet (Dog, Cat, Rodent, Fish, Bird).
    * Genotype ( AA, Aa, or aa).
    ordinal例子：
    * High school class rankings: 1st, 2nd, 3rd etc..
    * Social economic class: working, middle, upper.
    * The Likert Scale: agree, strongly agree, disagree etc..

In [31]:
import category_encoders as ce

# Get a new clean dataframe: rebuild obj_df from df, dropping the changes
# made to obj_df in the earlier cells
obj_df = df.select_dtypes(include=['object']).copy()

# Specify the columns to encode then fit and transform
encoder = ce.backward_difference.BackwardDifferenceEncoder(cols=["engine_type"])
encoder.fit(obj_df, verbose=1)

# Only display the first 7 columns for brevity (the slice 0:7 stops before
# column index 7)
encoder.transform(obj_df).iloc[:,0:7].head()

Unnamed: 0,col_engine_type_0,col_engine_type_1,col_engine_type_2,col_engine_type_3,col_engine_type_4,col_engine_type_5,col_engine_type_6
0,1.0,-0.857143,-0.714286,-0.571429,-0.428571,-0.285714,-0.142857
1,1.0,-0.857143,-0.714286,-0.571429,-0.428571,-0.285714,-0.142857
2,1.0,0.142857,-0.714286,-0.571429,-0.428571,-0.285714,-0.142857
3,1.0,0.142857,0.285714,-0.571429,-0.428571,-0.285714,-0.142857
4,1.0,0.142857,0.285714,-0.571429,-0.428571,-0.285714,-0.142857


##### b. polynomial encoding

In [33]:
# Encode engine_type with PolynomialEncoder, then preview the first
# 7 columns of the transformed frame
encoder = ce.polynomial.PolynomialEncoder(cols=["engine_type"])
encoder.fit(obj_df, verbose=1)
encoder.transform(obj_df).iloc[:,0:7].head()

Unnamed: 0,col_engine_type_0,col_engine_type_1,col_engine_type_2,col_engine_type_3,col_engine_type_4,col_engine_type_5,col_engine_type_6
0,1.0,-0.566947,0.5455447,-0.408248,0.241747,-0.109109,0.032898
1,1.0,-0.566947,0.5455447,-0.408248,0.241747,-0.109109,0.032898
2,1.0,-0.377964,-1.3107410000000001e-17,0.408248,-0.564076,0.436436,-0.197386
3,1.0,-0.188982,-0.3273268,0.408248,0.080582,-0.545545,0.493464
4,1.0,-0.188982,-0.3273268,0.408248,0.080582,-0.545545,0.493464


###### c. binary encoding（可以和one-hot+pca的方法进行比较）
    先将categorical特征编码成整数，然后转换成二进制数进行存储。二进制数的每一位代表一个维度的特征。
    这种方法适用于nominal和ordinal的数据类型。比起one-hot的方法，这种方法不会产生过多的特征维度。在数据量没有变化的情况下增加特征的维度会产生过拟合，影响分类和预测的效果。

In [55]:
# Encode make with BinaryEncoder, then preview the first 7 columns of the
# transformed frame
encoder = ce.binary.BinaryEncoder(cols=["make"])
encoder.fit(obj_df, verbose=1)
encoder.transform(obj_df).iloc[:,0:7].head()

Unnamed: 0,make_0,make_1,make_2,make_3,make_4,fuel_type,aspiration
0,0,0,0,0,0,gas,std
1,0,0,0,0,0,gas,std
2,0,0,0,0,0,gas,std
3,0,0,0,0,1,gas,std
4,0,0,0,0,1,gas,std


###### d. hashing

In [57]:
# Encode make with HashingEncoder, then preview the first 10 columns of
# the transformed frame
encoder = ce.hashing.HashingEncoder(cols=["make"])
encoder.fit(obj_df, verbose=1)
encoder.transform(obj_df).iloc[:,0:10].head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,fuel_type,aspiration
0,0,0,0,0,0,0,1,0,gas,std
1,0,0,0,0,0,0,1,0,gas,std
2,0,0,0,0,0,0,1,0,gas,std
3,1,0,0,0,0,0,0,0,gas,std
4,1,0,0,0,0,0,0,0,gas,std
