# Категориальные признаки

Категориальные признаки, в отличие от количественных, — это признаки, принимающие значения из некоторого неупорядоченного множества, иначе говоря — дискретные значения.

Например: 

цвет: красный, желтый, синий

вкус: горький, сладкий, кислый

порода собаки: овчарка, акита, такса

Невозможно сравнить эти признаки: нельзя сказать, что горький больше сладкого или желтый меньше красного. Точно так же невозможно напрямую использовать эти признаки в линейных моделях, так как невозможно умножить такой признак на какой-либо вес. 
Для этого необходимо эти признаки преобразовать. 

В качестве модельных данных возьмем Credit Approval Data Set из коллекции UCI Machine Learning Repository.

In [56]:
import numpy as np
import pandas as pd

In [41]:
# Credit Approval data set (UCI). The file has no header row, and missing
# values are written as '?', so they are parsed as NaN.
url = (
    'https://archive.ics.uci.edu/ml/machine-learning-databases/'
    'credit-screening/crx.data'
)
df = pd.read_csv(url, header=None, na_values=['?'])
df.shape

(690, 16)

In [42]:
# Preview the first five rows of the loaded table.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


Как нетрудно заметить, датасет имеет оба типа признаков: количественные и категориальные.
Выделим второй тип признаков отдельно.

In [45]:
# Columns whose dtype is 'object' (parsed as strings) are treated as categorical.
categorical_columns = [
    label for label, dtype in df.dtypes.items() if dtype.name == 'object'
]
print(categorical_columns)

[0, 3, 4, 5, 6, 8, 9, 11, 12, 15]


Необходимо убедиться, что все данные в таблице заполнены, а если где-то значения отсутствуют, то необходимо их заполнить, к примеру, наиболее частым значением.

In [50]:
# Number of non-missing values in each column (the table has 690 rows).
df.count(axis=0)

0     678
1     678
2     690
3     684
4     684
5     681
6     681
7     690
8     690
9     690
10    690
11    690
12    690
13    677
14    690
15    690
dtype: int64

In [54]:
# describe() on object columns reports, per column, the most frequent value
# in its 'top' row; use that value to fill the gaps in each categorical column.
data_describe = df.describe(include=[object])
most_frequent = data_describe.loc['top']
for column in categorical_columns:
    df[column] = df[column].fillna(most_frequent[column])

In [55]:
# Re-count non-missing values to confirm the categorical columns are now complete.
df.count(axis=0)

0     690
1     678
2     690
3     690
4     690
5     690
6     690
7     690
8     690
9     690
10    690
11    690
12    690
13    677
14    690
15    690
dtype: int64

Ну вот, теперь нет пустых значений в категориальных признаках (в количественных столбцах 1 и 13 пропуски остались). 

# Label Encoding


Самым простым решением будет использовать LabelEncoder из модуля preprocessing библиотеки sklearn. LabelEncoder присваивает каждому уникальному значению признака целое число от 0 до n_classes - 1 (в порядке сортировки значений). К примеру, желтый становится 0, красный — 1, синий — 2 и т.д.

In [64]:
from sklearn import preprocessing

# LabelEncoder maps the sorted unique values of a column to 0..n_classes-1.
# A separate encoder is fitted for every column and kept in `label_encoders`,
# so the integer codes can later be mapped back with inverse_transform() or
# applied to new data; re-fitting one shared encoder would discard every
# mapping except the last column's.
label_encoders = {}
for column in categorical_columns:
    lbl = preprocessing.LabelEncoder()
    df[column] = lbl.fit_transform(df[column])
    label_encoders[column] = lbl
df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,1,30.83,0.0,1,0,12,7,1.25,1,1,1,0,0,202.0,0,0
1,0,58.67,4.46,1,0,10,3,3.04,1,1,6,0,0,43.0,560,0
2,0,24.5,0.5,1,0,10,3,1.5,1,0,0,0,0,280.0,824,0
3,1,27.83,1.54,1,0,12,7,3.75,1,1,5,1,0,100.0,3,0
4,1,20.17,5.625,1,0,12,7,1.71,1,0,0,0,2,120.0,0,0
5,1,32.08,4.0,1,0,9,7,2.5,1,0,0,1,0,360.0,0,0
6,1,33.17,1.04,1,0,11,3,6.5,1,0,0,1,0,164.0,31285,0
7,0,22.92,11.585,1,0,2,7,0.04,1,0,0,0,0,80.0,1349,0
8,1,54.42,0.5,2,2,8,3,3.96,1,0,0,0,0,180.0,314,0
9,1,42.5,4.915,2,2,12,7,3.165,1,0,0,1,0,52.0,1442,0


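Данный метод плохо подходит для линейных моделей, в том числе для логистической регрессии: числовые метки задают несуществующий порядок между значениями признака, а модель умножает их на вес как обычные числа. Поэтому мы перейдем к другому, более удачному способу - One-Hot Encoding.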

# One-Hot Encoding

Смысл данного метода заключается в создании заполненной нулями матрицы, число столбцов которой соответствует количеству уникальных значений признака, с единицами только там, где это значение присутствует. 
Например:

    1 красный

    2 желтый

    3 синий

    1  1  0  0

    2  0  1  0

    3  0  0  1
Эта техника реализована в sklearn.preprocessing в классе OneHotEncoder.

In [66]:
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4. Try the current keyword first and fall back to the old one,
# so the cell returns a dense array on both new and old releases.
try:
    oe = preprocessing.OneHotEncoder(sparse_output=False)
except TypeError:
    oe = preprocessing.OneHotEncoder(sparse=False)
# One output column is produced per distinct value of each categorical column.
encoded = pd.DataFrame(oe.fit_transform(df[categorical_columns]))
encoded.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,40,41
0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
5,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
6,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
7,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
8,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
9,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0


Как видно, столбцов стало почти в 3 раза больше, зато данные теперь легко обрабатывать с помощью линейных моделей. 