# **Feature Hashing**

In [1]:
pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.4.1-py2.py3-none-any.whl (80 kB)
[K     |████████████████████████████████| 80 kB 4.0 MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.4.1


In [2]:
import hashlib

import pandas as pd
from category_encoders.hashing import HashingEncoder
from sklearn.model_selection import train_test_split

  import pandas.util.testing as tm


In [3]:
# Read the credit approval data set from disk and preview its first rows
data = pd.read_csv(filepath_or_buffer="credit_approval_uci_2.csv")
data.head(n=5)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,target
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [4]:
# Split the data into a training set (70%) and a test set (30%).
# The predictors are every column except "target"; the fixed seed
# keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=["target"]),
    data["target"],
    random_state=0,
    test_size=0.3,
)

# Show the dimensions of both predictor sets
(X_train.shape, X_test.shape)

((483, 15), (207, 15))

In [5]:
# Which distinct labels does the variable A7 take in the train set?
X_train.loc[:, "A7"].unique()

array(['v', 'ff', 'h', 'dd', 'z', 'bb', 'j', 'Missing', 'n', 'o'],
      dtype=object)

## **Hashing one feature**

In [6]:
# Configure a feature hashing encoder that encodes a single
# variable of the dataframe (A7) into 4 hashed components
encoder = HashingEncoder(
    n_components=4,
    cols=["A7"],
)

In [8]:
# Fit the encoder using the training data only
encoder.fit(X=X_train)

HashingEncoder(cols=['A7'], max_process=1, max_sample=483, n_components=4)

In [9]:
# Inspect which hashing algorithm the encoder is configured to use
encoder.hash_method

'md5'

In [10]:
# The algorithms available for hashing are listed by the standard-library
# hashlib module (imported with the other imports at the top of the notebook)
hashlib.algorithms_available

{'blake2b',
 'blake2s',
 'md5',
 'sha1',
 'sha224',
 'sha256',
 'sha384',
 'sha3_224',
 'sha3_256',
 'sha3_384',
 'sha3_512',
 'sha512',
 'shake_128',
 'shake_256'}

In [11]:
# Apply the fitted encoder to the train set first, then to the test set
X_train_enc, X_test_enc = map(encoder.transform, (X_train, X_test))

In [12]:
# The hashed features (col_0 ... col_3) appear at the left of the dataframe,
# and the original A7 column is no longer present
X_train_enc.head()

Unnamed: 0,col_0,col_1,col_2,col_3,A1,A2,A3,A4,A5,A6,A8,A9,A10,A11,A12,A13,A14,A15
596,0,0,1,0,a,46.08,3.0,u,g,c,2.375,t,t,8,t,g,396.0,4159
303,0,0,1,0,a,15.92,2.875,u,g,q,0.085,f,f,0,f,g,120.0,0
204,0,0,1,0,b,36.33,2.125,y,p,w,0.085,t,t,1,f,g,50.0,1187
351,0,1,0,0,b,22.17,0.585,y,p,ff,0.0,f,f,0,f,g,100.0,0
118,0,0,1,0,b,57.83,7.04,u,g,m,14.0,t,t,6,t,g,360.0,1332


In [13]:
# The test set gets the same layout: the hashed features (col_0 ... col_3)
# appear at the left of the dataframe and A7 is no longer present
X_test_enc.head()

Unnamed: 0,col_0,col_1,col_2,col_3,A1,A2,A3,A4,A5,A6,A8,A9,A10,A11,A12,A13,A14,A15
14,0,0,1,0,a,45.83,10.5,u,g,q,5.0,t,t,7,t,g,0.0,0
586,0,1,0,0,b,64.08,20.0,u,g,x,17.5,t,t,9,t,g,0.0,1000
140,0,1,0,0,a,31.25,3.75,u,g,cc,0.625,t,t,9,t,g,181.0,0
492,0,0,1,0,b,39.25,9.5,u,g,m,6.5,t,t,14,f,g,240.0,4607
350,0,1,0,0,a,26.17,2.0,u,g,j,0.0,f,f,0,t,g,276.0,1


## **Multivariate hashing**

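When several variables are passed in `cols`, Category Encoders hashes all of them into the same set of `n_components` output columns (4 here), rather than creating separate hashed columns for each variable. It is like "multivariate" hashing.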

In [14]:
# Configure a hashing encoder that hashes four variables together
# into the same 4 components.
# NOTE(review): A14 looks numeric in the data preview — confirm it is
# meant to be hashed alongside the categorical variables.
encoder = HashingEncoder(
    n_components=4,
    cols=["A5", "A7", "A12", "A14"],
)

In [15]:
# Fit the multi-variable encoder using the training data only
encoder.fit(X=X_train)

HashingEncoder(cols=['A5', 'A7', 'A12', 'A14'], max_process=1, max_sample=483,
               n_components=4)

In [16]:
# Encode the train set first and then the test set with the fitted encoder
X_train_enc, X_test_enc = (
    encoder.transform(X_train),
    encoder.transform(X_test),
)

In [17]:
# The hashed features (col_0 ... col_3) are again at the left of the
# dataframe, and the four encoded variables are no longer present.
# The hashed features now take values greater than 1, which means
# that, within a row, the values of 2 or more variables were
# assigned to the same hashed feature.
X_train_enc.head()

Unnamed: 0,col_0,col_1,col_2,col_3,A1,A2,A3,A4,A6,A8,A9,A10,A11,A13,A15
596,0,2,2,0,a,46.08,3.0,u,c,2.375,t,t,8,g,4159
303,0,1,2,1,a,15.92,2.875,u,q,0.085,f,f,0,g,0
204,1,0,2,1,b,36.33,2.125,y,w,0.085,t,t,1,g,1187
351,0,1,2,1,b,22.17,0.585,y,ff,0.0,f,f,0,g,0
118,1,1,2,0,b,57.83,7.04,u,m,14.0,t,t,6,g,1332


In [18]:
# Preview the first rows of the encoded test set
X_test_enc.head()

Unnamed: 0,col_0,col_1,col_2,col_3,A1,A2,A3,A4,A6,A8,A9,A10,A11,A13,A15
14,1,1,2,0,a,45.83,10.5,u,q,5.0,t,t,7,g,0
586,1,2,1,0,b,64.08,20.0,u,x,17.5,t,t,9,g,1000
140,0,2,2,0,a,31.25,3.75,u,cc,0.625,t,t,9,g,0
492,0,1,2,1,b,39.25,9.5,u,m,6.5,t,t,14,g,4607
350,1,2,1,0,a,26.17,2.0,u,j,0.0,f,f,0,g,1
