# **Encoding Categories with K-Means**

In [1]:
pip install feature-engine

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting feature-engine
  Downloading feature_engine-1.3.0-py2.py3-none-any.whl (260 kB)
[K     |████████████████████████████████| 260 kB 12.5 MB/s 
[?25hCollecting statsmodels>=0.11.1
  Downloading statsmodels-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
[K     |████████████████████████████████| 9.8 MB 28.0 MB/s 
Installing collected packages: statsmodels, feature-engine
  Attempting uninstall: statsmodels
    Found existing installation: statsmodels 0.10.2
    Uninstalling statsmodels-0.10.2:
      Successfully uninstalled statsmodels-0.10.2
Successfully installed feature-engine-1.3.0 statsmodels-0.13.2


In [2]:
import pandas as pd
from feature_engine.encoding import MeanEncoder, OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

In [3]:
# Read the credit approval data set from its CSV file (relative path),
# then preview the first five rows to check it parsed as expected.
data = pd.read_csv(filepath_or_buffer="credit_approval_uci_2.csv")
data.head(n=5)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,target
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [4]:
# Separate the predictors from the target column, then hold out 30% of the
# observations as a test set.
predictors = data.drop(columns=["target"])
target = data["target"]

X_train, X_test, y_train, y_test = train_test_split(
    predictors,
    target,
    test_size=0.3,
    random_state=0,  # fixed seed so the split is identical on every run
)

X_train.shape, X_test.shape

((483, 15), (207, 15))

In [5]:
# Collect the names of all object-dtype columns in the training set;
# these are treated as the categorical variables from here on.
vars_categorical = list(X_train.select_dtypes(include="object").columns)
vars_categorical

['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13']

In [6]:
# K-means relies on distances between observations, so the categories must
# first be replaced by numbers whose distances are meaningful.
# One-hot, frequency and target encoding are all options;
# here we set up the target (mean) encoder.
encoder = MeanEncoder(variables=vars_categorical)

# Learn the encoding from the training data only
encoder.fit(X_train, y_train)

# Apply the learned encoding to both the train and the test set
X_train_enc, X_test_enc = (
    encoder.transform(partition) for partition in (X_train, X_test)
)

# Preview the encoded categorical columns
X_train_enc[vars_categorical].head()

Unnamed: 0,A1,A4,A5,A6,A7,A9,A10,A12,A13
596,0.472222,0.512397,0.512397,0.451613,0.418773,0.785156,0.70283,0.445455,0.464853
303,0.472222,0.512397,0.512397,0.625,0.418773,0.070485,0.250923,0.452471,0.464853
204,0.438806,0.226087,0.226087,0.5,0.418773,0.785156,0.70283,0.452471,0.464853
351,0.438806,0.226087,0.226087,0.105263,0.146341,0.070485,0.250923,0.452471,0.464853
118,0.438806,0.512397,0.512397,0.423077,0.418773,0.785156,0.70283,0.445455,0.464853


In [7]:
# The idea with k-means encoding is to reduce the feature space.
# Here we want to represent the 9 encoded categorical variables with 5 features.
#
# random_state is fixed because k-means starts from randomly chosen centroids:
# without a seed, the cluster numbering (and the distances derived from it)
# changes from run to run. n_init is set explicitly because its default
# differs between scikit-learn releases.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=0)
kmeans.fit(X_train_enc[vars_categorical])

KMeans(n_clusters=5)

## **Clusters + one-hot encoding**

In [8]:
# We have the option to encode the variables into the
# clusters they belong to.
# The k-means model was already fitted on the train set, so both sets are
# only *assigned* to clusters here; calling fit_predict would refit the
# model and overwrite the one fitted earlier.
X_train_enc["clusters"] = kmeans.predict(X_train_enc[vars_categorical])
X_test_enc["clusters"] = kmeans.predict(X_test_enc[vars_categorical])
X_train_enc["clusters"].head()

596    1
303    2
204    1
351    4
118    1
Name: clusters, dtype: int32

In [9]:
# We would then drop the original variables.
# The result is re-assigned rather than dropped with inplace=True, so the
# cell reads as "new frame from old frame" and no object is mutated behind
# the reader's back.
X_train_enc = X_train_enc.drop(columns=vars_categorical)
X_test_enc = X_test_enc.drop(columns=vars_categorical)

In [10]:
# One-hot encode the cluster label, keeping one dummy column per cluster
# (drop_last=False). The "clusters" column is integer-typed, so
# ignore_format=True is passed to have the encoder accept it.
ohe = OneHotEncoder(
    variables="clusters",
    drop_last=False,
    ignore_format=True,
)
ohe.fit(X_train_enc)

# Encode both sets with the encoder fitted on the train set
X_train_enc, X_test_enc = (
    ohe.transform(frame) for frame in (X_train_enc, X_test_enc)
)

# The final dataset
X_train_enc.head()

Unnamed: 0,A2,A3,A8,A11,A14,A15,clusters_1,clusters_2,clusters_4,clusters_3,clusters_0
596,46.08,3.0,2.375,8,396.0,4159,1,0,0,0,0
303,15.92,2.875,0.085,0,120.0,0,0,1,0,0,0
204,36.33,2.125,0.085,1,50.0,1187,1,0,0,0,0
351,22.17,0.585,0.0,0,100.0,0,0,0,1,0,0
118,57.83,7.04,14.0,6,360.0,1332,1,0,0,0,0


## **Distance to clusters**

In [11]:
# Start again from target-encoded versions of the original train and test
# sets, replacing the frames modified in the previous section.
X_train_enc, X_test_enc = map(encoder.transform, (X_train, X_test))

In [12]:
# Alternatively we can encode the categorical
# variables into their distances to each of
# the clusters.
# The column names are derived from the model's n_clusters so they always
# match the number of columns returned by kmeans.transform.
clusters = [f"cluster_{i}" for i in range(kmeans.n_clusters)]

X_train_enc[clusters] = kmeans.transform(X_train_enc[vars_categorical])
X_test_enc[clusters] = kmeans.transform(X_test_enc[vars_categorical])

# The final dataset
X_train_enc[clusters].head()

Unnamed: 0,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4
596,0.726038,0.104615,0.848369,0.464075,0.940776
303,0.528106,0.857314,0.220917,0.737578,0.473868
204,0.800452,0.355977,0.956136,0.558597,0.854256
351,0.654696,1.055825,0.614738,0.923416,0.385301
118,0.723156,0.121732,0.847238,0.465905,0.938803


In [13]:
# We would then drop the original variables.
# The result is re-assigned rather than dropped with inplace=True, so no
# existing object is mutated.
X_train_enc = X_train_enc.drop(columns=vars_categorical)
X_test_enc = X_test_enc.drop(columns=vars_categorical)

# The final dataset
X_train_enc.head()

Unnamed: 0,A2,A3,A8,A11,A14,A15,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4
596,46.08,3.0,2.375,8,396.0,4159,0.726038,0.104615,0.848369,0.464075,0.940776
303,15.92,2.875,0.085,0,120.0,0,0.528106,0.857314,0.220917,0.737578,0.473868
204,36.33,2.125,0.085,1,50.0,1187,0.800452,0.355977,0.956136,0.558597,0.854256
351,22.17,0.585,0.0,0,100.0,0,0.654696,1.055825,0.614738,0.923416,0.385301
118,57.83,7.04,14.0,6,360.0,1332,0.723156,0.121732,0.847238,0.465905,0.938803
