# Categorical Features

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

# Notebook-wide plotting defaults: figure size and font size.
plt.rcParams.update({"figure.figsize": (8, 8), "font.size": 14})

import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split

In [2]:
# Toy dataset used in the first sections: two string-valued columns
# ('pet', 'rating') and two numeric ones ('children', 'salary').
data = pd.DataFrame(dict(
    pet=['cat', 'dog', 'dog', 'fish', 'cat', 'dog', 'cat', 'fish'],
    rating=['good', 'good', 'best', 'great', 'great', 'good', 'great', 'good'],
    children=[4., 6, 3, 3, 2, 3, 5, 4],
    salary=[90, 24, 44, 27, 32, 59, 36, 27],
))
data

Unnamed: 0,children,pet,rating,salary
0,4.0,cat,good,90
1,6.0,dog,good,24
2,3.0,dog,best,44
3,3.0,fish,great,27
4,2.0,cat,great,32
5,3.0,dog,good,59
6,5.0,cat,great,36
7,4.0,fish,good,27


## One-hot encoding

In [3]:
import category_encoders as ce

# Fit a one-hot encoder on the toy frame and display the encoded result.
encoder = ce.OneHotEncoder()
onehot = encoder.fit_transform(data)
onehot

Unnamed: 0,pet_0,pet_1,pet_2,pet_-1,rating_0,rating_1,rating_2,rating_-1,children,salary
0,1,0,0,0,1,0,0,0,4.0,90
1,0,1,0,0,1,0,0,0,6.0,24
2,0,1,0,0,0,1,0,0,3.0,44
3,0,0,1,0,0,0,1,0,3.0,27
4,1,0,0,0,0,0,1,0,2.0,32
5,0,1,0,0,1,0,0,0,3.0,59
6,1,0,0,0,0,0,1,0,5.0,36
7,0,0,1,0,1,0,0,0,4.0,27


In [4]:
# Encode two new rows whose 'pet' and 'rating' values also occur in the
# frame the encoder was fitted on.
known_rows = pd.DataFrame({
    'pet': ['cat', 'dog'],
    'rating': ['good', 'good'],
    'children': [4., 4],
    'salary': [90, 24],
})
encoder.transform(known_rows)

Unnamed: 0,pet_0,pet_1,pet_2,pet_-1,rating_0,rating_1,rating_2,rating_-1,children,salary
0,1,0,0,0,1,0,0,0,4.0,90
1,0,1,0,0,1,0,0,0,4.0,24


In [5]:
# 'ape' does not occur in the frame the encoder was fitted on; in the
# recorded output that row is flagged in the extra pet_-1 column.
unseen_rows = pd.DataFrame({
    'pet': ['ape', 'dog'],
    'rating': ['good', 'good'],
    'children': [4., 4],
    'salary': [90, 24],
})
encoder.transform(unseen_rows)

Unnamed: 0,pet_0,pet_1,pet_2,pet_-1,rating_0,rating_1,rating_2,rating_-1,children,salary
0,0,0,0,1,1,0,0,0,4.0,90
1,0,1,0,0,1,0,0,0,4.0,24


## Ordinal encoding

Use ordinal encoding if there is a meaningful order to your categories (for example good < great < best).

In [6]:
# No mapping is supplied, so the encoder assigns the integer codes itself.
encoder = ce.OrdinalEncoder()
ordinal = encoder.fit_transform(data)
ordinal

Unnamed: 0,children,salary,pet,rating
0,4.0,90,0,0
1,6.0,24,1,0
2,3.0,44,1,1
3,3.0,27,2,2
4,2.0,32,0,2
5,3.0,59,1,0
6,5.0,36,0,2
7,4.0,27,2,0


In [7]:
# Spell out the order good < great < best for 'rating' instead of letting
# the encoder choose the codes. Only 'rating' is given a mapping.
# NOTE(review): the expected shape of `mapping` may differ between
# category_encoders releases — confirm against the installed version.
rating_order = [('good', 0), ('great', 1), ('best', 2)]
encoder = ce.OrdinalEncoder(mapping=[{'col': 'rating', 'mapping': rating_order}])
encoder.fit_transform(data)

Unnamed: 0,children,pet,salary,rating
0,4.0,cat,90,0
1,6.0,dog,24,0
2,3.0,dog,44,2
3,3.0,fish,27,1
4,2.0,cat,32,1
5,3.0,dog,59,0
6,5.0,cat,36,1
7,4.0,fish,27,0


## Melbourne housing prices

In [8]:
# Load the Melbourne housing data and discard the 'Unnamed: 0' column.
melbourne_data = (
    pd.read_csv('../../data/melb_data.csv')
    .drop(columns=['Unnamed: 0'])
)

# The target is the 'Price' column.
y = melbourne_data['Price']

# Features: everything except the target, restricted to object-dtype
# (categorical) columns.
X = (
    melbourne_data
    .drop(columns=['Price'])
    .select_dtypes(include=['object'])
)

In [9]:
# Peek at the first five rows of the categorical features.
X.head(n=5)

Unnamed: 0,Suburb,Address,Type,Method,SellerG,Date,CouncilArea,Regionname
0,Abbotsford,85 Turner St,h,S,Biggin,3/12/2016,Yarra,Northern Metropolitan
1,Abbotsford,25 Bloomburg St,h,S,Biggin,4/02/2016,Yarra,Northern Metropolitan
2,Abbotsford,5 Charles St,h,SP,Biggin,4/03/2017,Yarra,Northern Metropolitan
3,Abbotsford,40 Federation La,h,PI,Biggin,4/03/2017,Yarra,Northern Metropolitan
4,Abbotsford,55a Park St,h,VB,Nelson,4/06/2016,Yarra,Northern Metropolitan


In [10]:
# Per column: does it contain at least one missing value?
# Some categorical features have gaps too.
missing_mask = X.isnull()
np.any(missing_mask, axis=0)

array([False, False, False, False, False, False,  True,  True])

In [11]:
# Shuffled train/test split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=True,
    random_state=42,
)

One-hot encoding does not work very well if you have a category with many different possible values. You end up with a very large dataset which contains mostly zeros.

In [12]:
# Summarise each training column; the recorded output lists count, unique,
# top and freq per column, which shows how many distinct values each has.
X_train.describe()

Unnamed: 0,Suburb,Address,Type,Method,SellerG,Date,CouncilArea,Regionname
count,13797,13797,13797,13797,13797,13797,9086,13796
unique,320,13639,3,5,286,58,31,8
top,Reservoir,16 Smith St,h,S,Nelson,23/09/2017,Boroondara,Southern Metropolitan
freq,403,3,9081,9008,1472,464,866,4750


In [13]:
# One-hot encode a 100-row sample of the training split and show the first
# ten encoded rows transposed (one encoded column per displayed row).
# The sample is seeded so that re-running the cell selects the same rows.
encoder = ce.OneHotEncoder()
encoder.fit_transform(X_train.sample(100, random_state=42)).head(10).T

Unnamed: 0,1975,16400,9034,17592,5132,13168,1876,3457,4889,3510
Suburb_0,1,0,0,0,0,0,0,0,0,0
Suburb_1,0,1,0,0,0,0,0,0,0,0
Suburb_2,0,0,1,0,0,0,0,0,0,0
Suburb_3,0,0,0,1,0,0,0,0,0,0
Suburb_4,0,0,0,0,1,0,0,0,0,0
Suburb_5,0,0,0,0,0,1,0,0,0,0
Suburb_6,0,0,0,0,0,0,1,0,0,0
Suburb_7,0,0,0,0,0,0,0,1,0,0
Suburb_8,0,0,0,0,0,0,0,0,1,0
Suburb_9,0,0,0,0,0,0,0,0,0,1


In [14]:
# Hash-encode the training split and preview the first ten rows.
encoder = ce.HashingEncoder()
hashed_train = encoder.fit_transform(X_train)
hashed_train.head(10)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7
791,1,2,1,1,0,2,0,1
16190,3,1,0,0,1,1,1,1
10231,0,3,0,2,0,0,1,2
337,1,1,0,1,2,2,0,1
11856,3,1,0,0,1,1,1,1
17380,1,0,0,1,1,1,2,2
2344,2,2,0,0,2,1,0,1
4004,0,2,0,0,2,1,1,2
13484,1,2,1,1,0,0,2,1
9553,1,3,0,1,0,1,2,0


## Illustrate hashing trick

Use our small pet-rating-salary dataset to illustrate how the hashing trick works.

In [15]:
# Show the small pet/rating/children/salary frame again before hashing it.
data

Unnamed: 0,children,pet,rating,salary
0,4.0,cat,good,90
1,6.0,dog,good,24
2,3.0,dog,best,44
3,3.0,fish,great,27
4,2.0,cat,great,32
5,3.0,dog,good,59
6,5.0,cat,great,36
7,4.0,fish,good,27


In [16]:
# Apply a hashing encoder to the toy frame and display the result.
encoder = ce.HashingEncoder()
hashed_toy = encoder.fit_transform(data)
hashed_toy

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,children,salary
0,2,0,0,0,0,0,0,0,4.0,90
1,1,0,0,0,0,1,0,0,6.0,24
2,0,0,0,0,0,1,1,0,3.0,44
3,0,1,0,0,0,0,0,1,3.0,27
4,1,0,0,0,0,0,0,1,2.0,32
5,1,0,0,0,0,1,0,0,3.0,59
6,1,0,0,0,0,0,0,1,5.0,36
7,1,1,0,0,0,0,0,0,4.0,27


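## Count based encoding

Each suburb is replaced by a statistic computed from the rows in that suburb; the cells below use the suburb's mean price divided by the largest suburb mean.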

In [17]:
# Work on a copy of the loaded frame. Note that this rebinds `data`, which
# earlier cells used for the toy pet frame.
data = melbourne_data.copy()

# Mean price per suburb, rescaled so the highest suburb mean becomes 1.
price_by_suburb = data.groupby("Suburb").Price
suburb_mean = price_by_suburb.mean()
average_price = suburb_mean / suburb_mean.max()
average_price

Suburb
Abbotsford            0.455722
Aberfeldie            0.562071
Airport West          0.327275
Albanvale             0.236657
Albert Park           0.854346
Albion                0.267431
Alphington            0.576320
Altona                0.374270
Altona Meadows        0.280310
Altona North          0.346481
Ardeer                0.274099
Armadale              0.717729
Ascot Vale            0.463378
Ashburton             0.736776
Ashwood               0.495888
Aspendale             0.536110
Aspendale Gardens     0.385947
Attwood               0.369224
Avondale Heights      0.368952
Bacchus Marsh         0.126528
Balaclava             0.340374
Balwyn                0.809855
Balwyn North          0.776566
Bayswater             0.315072
Bayswater North       0.320759
Beaconsfield          0.267928
Beaconsfield Upper    0.319427
Beaumaris             0.688863
Bellfield             0.359030
Bentleigh             0.566994
                        ...   
Upwey                 0.255275
V

In [18]:
# Look up each row's suburb in average_price and store the value as a new column.
suburb_score = data['Suburb'].map(average_price)
data['Suburb_freq'] = suburb_score

In [19]:
# Spot-check ten rows of the new column next to the suburb it was derived from.
# The sample is seeded so the same rows are shown on every run.
data[['Suburb_freq', 'Suburb']].sample(10, random_state=42)

Unnamed: 0,Suburb_freq,Suburb
13545,0.688863,Beaumaris
18118,0.242989,Hoppers Crossing
9458,0.717729,Armadale
4324,0.537357,Ivanhoe
11532,0.431183,Brunswick
4900,0.658602,Malvern East
13640,0.563709,Fitzroy
13551,0.374243,Boronia
17838,0.453766,Vermont
3039,0.464991,Essendon


## CatBoost

In [20]:
import catboost

In [21]:
# Keep only the rows that have no missing value in any column.
X_ = melbourne_data.dropna()

# Target is the 'Price' column; features are the remaining object-dtype
# (categorical) columns only.
y = X_.Price
X = X_.drop(columns=['Price']).select_dtypes(include=['object'])

# Shuffled train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=True, random_state=42)

In [22]:
# Create a CatBoost regressor; no arguments are passed, so library defaults apply.
cb = catboost.CatBoostRegressor()

In [23]:
# X_train holds only the object-dtype columns selected above, so every
# column is declared categorical. The indices are derived from the frame's
# width instead of a hard-coded column count.
cb.fit(X_train, y_train, cat_features=list(range(X_train.shape[1])))

0:	learn: 1244922.8376450	total: 75.8ms	remaining: 1m 15s
1:	learn: 1214164.4278275	total: 91.5ms	remaining: 45.6s
2:	learn: 1188971.2811898	total: 96.6ms	remaining: 32.1s
3:	learn: 1160577.6347637	total: 106ms	remaining: 26.5s
4:	learn: 1132937.1302716	total: 118ms	remaining: 23.4s
5:	learn: 1106263.8605961	total: 134ms	remaining: 22.3s
6:	learn: 1080465.3120870	total: 150ms	remaining: 21.3s
7:	learn: 1055717.1542746	total: 164ms	remaining: 20.4s
8:	learn: 1031997.8816167	total: 174ms	remaining: 19.2s
9:	learn: 1009568.5584506	total: 181ms	remaining: 18s
10:	learn: 987386.6022994	total: 197ms	remaining: 17.7s
11:	learn: 966164.1875225	total: 210ms	remaining: 17.3s
12:	learn: 945928.9562110	total: 218ms	remaining: 16.5s
13:	learn: 926138.4846185	total: 234ms	remaining: 16.5s
14:	learn: 906722.5024266	total: 251ms	remaining: 16.5s
15:	learn: 888683.2488516	total: 259ms	remaining: 15.9s
16:	learn: 870875.1728404	total: 276ms	remaining: 16s
17:	learn: 853875.6695252	total: 289ms	remaining

155:	learn: 459152.9506094	total: 2.78s	remaining: 15s
156:	learn: 459084.2981361	total: 2.8s	remaining: 15.1s
157:	learn: 459063.5934459	total: 2.81s	remaining: 15s
158:	learn: 458609.5110498	total: 2.83s	remaining: 15s
159:	learn: 458412.8010917	total: 2.86s	remaining: 15s
160:	learn: 458407.7321395	total: 2.87s	remaining: 14.9s
161:	learn: 458341.5204539	total: 2.89s	remaining: 15s
162:	learn: 458283.4658740	total: 2.91s	remaining: 14.9s
163:	learn: 458224.5266411	total: 2.93s	remaining: 14.9s
164:	learn: 458208.9255875	total: 2.94s	remaining: 14.9s
165:	learn: 458017.5464395	total: 2.96s	remaining: 14.8s
166:	learn: 458013.2002199	total: 2.96s	remaining: 14.8s
167:	learn: 457904.6907159	total: 3s	remaining: 14.8s
168:	learn: 457824.4824075	total: 3.03s	remaining: 14.9s
169:	learn: 457703.5456586	total: 3.06s	remaining: 15s
170:	learn: 457465.6695511	total: 3.1s	remaining: 15s
171:	learn: 457462.4859981	total: 3.11s	remaining: 15s
172:	learn: 457284.8429938	total: 3.14s	remaining: 1

307:	learn: 439872.3539886	total: 5.27s	remaining: 11.8s
308:	learn: 439779.8984874	total: 5.29s	remaining: 11.8s
309:	learn: 439683.1467502	total: 5.3s	remaining: 11.8s
310:	learn: 439539.1573153	total: 5.31s	remaining: 11.8s
311:	learn: 439328.0935385	total: 5.33s	remaining: 11.7s
312:	learn: 439100.9185153	total: 5.34s	remaining: 11.7s
313:	learn: 438987.7923825	total: 5.36s	remaining: 11.7s
314:	learn: 438783.2340441	total: 5.37s	remaining: 11.7s
315:	learn: 438614.8763103	total: 5.39s	remaining: 11.7s
316:	learn: 438532.7423565	total: 5.41s	remaining: 11.7s
317:	learn: 438402.2969475	total: 5.43s	remaining: 11.6s
318:	learn: 438222.2610227	total: 5.45s	remaining: 11.6s
319:	learn: 438176.6377058	total: 5.46s	remaining: 11.6s
320:	learn: 437965.9735997	total: 5.48s	remaining: 11.6s
321:	learn: 437807.9719683	total: 5.5s	remaining: 11.6s
322:	learn: 437643.9527156	total: 5.52s	remaining: 11.6s
323:	learn: 437290.9765646	total: 5.53s	remaining: 11.5s
324:	learn: 437236.3373451	total:

456:	learn: 420391.9467142	total: 7.77s	remaining: 9.24s
457:	learn: 420296.7319219	total: 7.79s	remaining: 9.22s
458:	learn: 420167.4467709	total: 7.8s	remaining: 9.2s
459:	learn: 420069.4214193	total: 7.82s	remaining: 9.18s
460:	learn: 420020.6204338	total: 7.83s	remaining: 9.16s
461:	learn: 419986.0376599	total: 7.85s	remaining: 9.14s
462:	learn: 419878.3623089	total: 7.86s	remaining: 9.12s
463:	learn: 419747.8767286	total: 7.87s	remaining: 9.1s
464:	learn: 419658.4023343	total: 7.89s	remaining: 9.08s
465:	learn: 419565.2456611	total: 7.9s	remaining: 9.05s
466:	learn: 419522.6230926	total: 7.92s	remaining: 9.04s
467:	learn: 419411.4071330	total: 7.93s	remaining: 9.02s
468:	learn: 419351.9199311	total: 7.95s	remaining: 9s
469:	learn: 419288.3254888	total: 7.96s	remaining: 8.98s
470:	learn: 419216.2819460	total: 7.98s	remaining: 8.96s
471:	learn: 419112.0438540	total: 7.99s	remaining: 8.94s
472:	learn: 419035.6978454	total: 8.01s	remaining: 8.92s
473:	learn: 418957.7644966	total: 8.02

609:	learn: 403272.2851977	total: 10s	remaining: 6.41s
610:	learn: 403168.1316887	total: 10s	remaining: 6.4s
611:	learn: 403105.0370117	total: 10.1s	remaining: 6.38s
612:	learn: 403047.8069342	total: 10.1s	remaining: 6.36s
613:	learn: 402947.9461439	total: 10.1s	remaining: 6.34s
614:	learn: 402882.6275367	total: 10.1s	remaining: 6.33s
615:	learn: 402795.3353219	total: 10.1s	remaining: 6.31s
616:	learn: 402740.0575466	total: 10.1s	remaining: 6.29s
617:	learn: 402686.8160538	total: 10.1s	remaining: 6.27s
618:	learn: 402639.5964113	total: 10.2s	remaining: 6.25s
619:	learn: 402548.5493115	total: 10.2s	remaining: 6.23s
620:	learn: 402379.1070295	total: 10.2s	remaining: 6.21s
621:	learn: 402306.7570647	total: 10.2s	remaining: 6.2s
622:	learn: 402129.6855254	total: 10.2s	remaining: 6.18s
623:	learn: 402018.7527982	total: 10.2s	remaining: 6.16s
624:	learn: 401958.8245038	total: 10.2s	remaining: 6.14s
625:	learn: 401717.2528061	total: 10.3s	remaining: 6.13s
626:	learn: 401620.1643337	total: 10.

766:	learn: 389669.7260686	total: 12.5s	remaining: 3.8s
767:	learn: 389463.6943196	total: 12.5s	remaining: 3.78s
768:	learn: 389409.1736896	total: 12.5s	remaining: 3.77s
769:	learn: 389332.2494332	total: 12.6s	remaining: 3.75s
770:	learn: 389293.5183897	total: 12.6s	remaining: 3.73s
771:	learn: 389183.5860381	total: 12.6s	remaining: 3.72s
772:	learn: 389166.6361746	total: 12.6s	remaining: 3.7s
773:	learn: 389081.6446938	total: 12.6s	remaining: 3.68s
774:	learn: 388905.8863167	total: 12.6s	remaining: 3.67s
775:	learn: 388847.9573354	total: 12.6s	remaining: 3.65s
776:	learn: 388772.2108900	total: 12.7s	remaining: 3.63s
777:	learn: 388691.1246773	total: 12.7s	remaining: 3.61s
778:	learn: 388619.3387887	total: 12.7s	remaining: 3.6s
779:	learn: 388449.5093511	total: 12.7s	remaining: 3.58s
780:	learn: 388392.5777455	total: 12.7s	remaining: 3.56s
781:	learn: 388370.9699078	total: 12.7s	remaining: 3.54s
782:	learn: 388306.9763243	total: 12.7s	remaining: 3.53s
783:	learn: 388260.4905044	total: 

917:	learn: 377600.1543040	total: 15s	remaining: 1.34s
918:	learn: 377583.3208894	total: 15s	remaining: 1.32s
919:	learn: 377488.1841740	total: 15s	remaining: 1.31s
920:	learn: 377440.7851878	total: 15.1s	remaining: 1.29s
921:	learn: 377355.3708972	total: 15.1s	remaining: 1.28s
922:	learn: 377327.1033607	total: 15.1s	remaining: 1.26s
923:	learn: 377286.0083796	total: 15.1s	remaining: 1.24s
924:	learn: 377211.8287630	total: 15.1s	remaining: 1.23s
925:	learn: 377184.0271450	total: 15.2s	remaining: 1.21s
926:	learn: 377074.0502767	total: 15.2s	remaining: 1.2s
927:	learn: 377012.0121073	total: 15.2s	remaining: 1.18s
928:	learn: 376977.4537968	total: 15.2s	remaining: 1.16s
929:	learn: 376950.1541556	total: 15.2s	remaining: 1.15s
930:	learn: 376887.8396440	total: 15.3s	remaining: 1.13s
931:	learn: 376848.3195327	total: 15.3s	remaining: 1.11s
932:	learn: 376813.3356289	total: 15.3s	remaining: 1.1s
933:	learn: 376651.7324287	total: 15.3s	remaining: 1.08s
934:	learn: 376628.0274715	total: 15.3s

<catboost.core.CatBoostRegressor at 0x1114307b8>

In [24]:
# Evaluate the fitted regressor on the held-out test split.
# NOTE(review): which metric score() reports may depend on the installed
# catboost version — confirm before interpreting the number shown below.
cb.score(X_test, y_test)

382863.21348509577