In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from kmodes.kmodes import KModes
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Read the car-evaluation dataset; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='data/car_evaluation.csv')

In [3]:
# Preview the first ten rows to check that the file was parsed as expected.
data.head(n=10)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,classes
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
5,vhigh,vhigh,2,2,med,high,unacc
6,vhigh,vhigh,2,2,big,low,unacc
7,vhigh,vhigh,2,2,big,med,unacc
8,vhigh,vhigh,2,2,big,high,unacc
9,vhigh,vhigh,2,4,small,low,unacc


In [4]:
# Summarise the column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
buying      1728 non-null object
maint       1728 non-null object
doors       1728 non-null object
persons     1728 non-null object
lug_boot    1728 non-null object
safety      1728 non-null object
classes     1728 non-null object
dtypes: object(7)
memory usage: 94.6+ KB


In [5]:
# Ordinal-encode every categorical column with an explicit value -> integer mapping.
# Each mapped column is assigned back to the frame instead of calling
# `Series.replace(..., inplace=True)` on a column selection: that chained in-place
# form is deprecated in pandas 2.x and does not update the frame under Copy-on-Write.
ORDINAL_CODES = {
    'buying':   {'vhigh': 1, 'high': 2, 'med': 3, 'low': 4},
    'maint':    {'vhigh': 1, 'high': 2, 'med': 3, 'low': 4},
    'doors':    {'2': 1, '3': 2, '4': 3, '5more': 4},
    'persons':  {'2': 1, '4': 2, 'more': 3},
    'lug_boot': {'small': 1, 'med': 2, 'big': 3},
    'safety':   {'low': 1, 'med': 2, 'high': 3},
    'classes':  {'unacc': 1, 'acc': 2, 'good': 3, 'vgood': 4},
}

# `assign` returns a new frame, so the raw `data` frame is left untouched and this
# cell gives the same result every time it is re-run.
data_rv = data.assign(
    **{column: data[column].map(codes) for column, codes in ORDINAL_CODES.items()}
)

# `map` turns any value that is missing from its mapping into NaN; fail loudly here
# rather than passing NaNs on to the clustering models.
unmapped_counts = data_rv[list(ORDINAL_CODES)].isna().sum()
assert not unmapped_counts.any(), (
    f"values without an ordinal code:\n{unmapped_counts[unmapped_counts > 0]}"
)
data_rv = data_rv.astype({column: 'int64' for column in ORDINAL_CODES})

# Features are every encoded column except the target; `classes` is the target.
X = data_rv.drop(columns=['classes'])
y = data_rv['classes']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

In [6]:
from sklearn import metrics

def purity_score(y_true, y_pred):
    """Return the purity of a cluster assignment with respect to the true labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class label of each sample.
    y_pred : array-like
        Cluster label assigned to each sample.

    Returns
    -------
    float
        Sum of the column-wise maxima of the contingency matrix divided by the
        total number of samples counted in it.
    """
    # Cross-tabulate true labels against predicted cluster labels.
    table = metrics.cluster.contingency_matrix(y_true, y_pred)
    # Largest count in each column of the table.
    dominant_counts = table.max(axis=0)
    return dominant_counts.sum() / table.sum()

### First (default) approach: Using k-Means

In [7]:
# Fit k-means with one cluster per target class (the `classes` column has four values).
# `random_state` pins the centroid initialisation so the purity below is reproducible
# on Restart & Run All; `n_init` is spelled out because scikit-learn's default for it
# differs between releases.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=7)
kmeans.fit(X_train)

# Assign the held-out samples to the fitted clusters and score them against the
# true classes.
predictions = kmeans.predict(X_test)
purity_score(y_test, predictions)

0.7109826589595376

"The standard k-means algorithm isn't directly applicable to categorical data, for various reasons. The sample space for categorical data is discrete, and doesn't have a natural origin. A Euclidean distance function on such a space isn't really meaningful."

https://datascience.stackexchange.com/questions/22/k-means-clustering-for-mixed-numeric-and-categorical-data/24#24

### Second (alternative) approach: Using k-Modes

In [11]:
# Fit k-modes with one cluster per target class (the `classes` column has four values).
# `random_state` fixes any random choices made while fitting, using the same seed as
# the train/test split, so the purity below is reproducible on Restart & Run All.
kmodes = KModes(n_clusters=4, random_state=7)
kmodes.fit(X_train)

# Assign the held-out samples to the fitted clusters and score them against the
# true classes.
predictions = kmodes.predict(X_test)
purity_score(y_test, predictions)

0.7109826589595376

<b>Other approaches for clustering categorical data</b>:
* k-Medoids / Partitioning Around Medoids (PAM)
* k-Prototypes
* ROCK --> https://theory.stanford.edu/~sudipto/mypapers/categorical.pdf
* Hierarchical clustering (e.g. Agglomerative clustering)
* Density-based clustering (e.g. DBSCAN)