In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Load the car-evaluation dataset; the path is relative to the working directory.
data = pd.read_csv('data/car_evaluation.csv')

In [26]:
# Preview the first ten rows of the raw frame.
data.head(10)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,classes
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
5,vhigh,vhigh,2,2,med,high,unacc
6,vhigh,vhigh,2,2,big,low,unacc
7,vhigh,vhigh,2,2,big,med,unacc
8,vhigh,vhigh,2,2,big,high,unacc
9,vhigh,vhigh,2,4,small,low,unacc


In [4]:
# (rows, columns) of the raw frame.
data.shape

(1728, 7)

In [5]:
# Per-column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
buying      1728 non-null object
maint       1728 non-null object
doors       1728 non-null object
persons     1728 non-null object
lug_boot    1728 non-null object
safety      1728 non-null object
classes     1728 non-null object
dtypes: object(7)
memory usage: 94.6+ KB


### Ordinal data can be encoded in one of three ways:
1. It can be assumed to be close enough to interval data — with relatively equal magnitudes between the values — to treat it as such. Social scientists make this assumption all the time with Likert scales. For example, “On a scale from 1 to 7, 1 being extremely unlikely, 4 being neither likely nor unlikely and 7 being extremely likely, how likely are you to recommend this movie to a friend?”. Here the difference between 3 and 4 and the difference between 6 and 7 can be reasonably assumed to be similar.
2. It can be treated as nominal data, where each category has no numeric relationship to another. One-hot encoding and other encodings appropriate for nominal data make sense here.
3. The magnitude of the difference between the numbers can be ignored. You can just train your model with different encodings and see which encoding works best.

<b>In some cases, the order of ordinal data must be preserved!</b>

https://towardsdatascience.com/smarter-ways-to-encode-categorical-data-for-machine-learning-part-1-of-3-6dca2f71b159

### 1st approach : Encoding data by replacing values

In [6]:
# Encode every ordinal column with an explicit, hand-written integer mapping.
# Each mapping lists the categories in their intended order, so the integer
# codes preserve that order.
ORDINAL_MAPPINGS = {
    'buying':   {'vhigh': 1, 'high': 2, 'med': 3, 'low': 4},
    'maint':    {'vhigh': 1, 'high': 2, 'med': 3, 'low': 4},
    'doors':    {'2': 1, '3': 2, '4': 3, '5more': 4},
    'persons':  {'2': 1, '4': 2, 'more': 3},
    'lug_boot': {'small': 1, 'med': 2, 'big': 3},
    'safety':   {'low': 1, 'med': 2, 'high': 3},
    'classes':  {'unacc': 1, 'acc': 2, 'good': 3, 'vgood': 4},
}

# Work on a copy so the raw frame stays untouched for the other approaches.
data_rv = data.copy()

for column, mapping in ORDINAL_MAPPINGS.items():
    # Assign the encoded column back rather than calling
    # `data_rv[column].replace(..., inplace=True)`: an inplace method on a
    # selected column is chained assignment, which newer pandas warns about
    # and which does not modify the parent frame under Copy-on-Write.
    data_rv[column] = data_rv[column].map(mapping)

# `.map` produces NaN for any value missing from its mapping; fail loudly
# instead of silently training on partially encoded data.
unmapped_counts = data_rv[list(ORDINAL_MAPPINGS)].isna().sum()
assert not unmapped_counts.any(), (
    f"values without an encoding: {unmapped_counts[unmapped_counts > 0].to_dict()}"
)

In [7]:
# Class distribution after the manual encoding.
data_rv['classes'].value_counts()

1    1210
2     384
3      69
4      65
Name: classes, dtype: int64

In [8]:
# Separate the target column from the predictors.
y = data_rv['classes']
X = data_rv.drop(columns=['classes'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7
)

# Fit a k-nearest-neighbours classifier with its default settings.
knn = KNeighborsClassifier().fit(X_train, y_train)

# Report test-set accuracy as a percentage.
predictions = knn.predict(X_test)
print(accuracy_score(y_test, predictions) * 100)

93.9306358381503


### 2nd approach : Encoding data by using label encoding

In [9]:
# Second approach: let scikit-learn's LabelEncoder choose the integer codes.
data_le = data.copy()
lb_make = LabelEncoder()

# A single encoder instance is refitted on each column in turn, so once the
# loop finishes it only retains the fit for the last column, 'classes'.
for column in ('buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'classes'):
    data_le[column] = lb_make.fit_transform(data[column])

In [25]:
# Preview the label-encoded frame.
data_le.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,classes
0,3,3,0,0,2,1,2
1,3,3,0,0,2,2,2
2,3,3,0,0,2,0,2
3,3,3,0,0,1,1,2
4,3,3,0,0,1,2,2


In [10]:
# Class distribution after label encoding.
data_le['classes'].value_counts()

2    1210
0     384
1      69
3      65
Name: classes, dtype: int64

In [24]:
# Categories held by the encoder after its most recent fit; the last column it
# was fitted on is 'classes'. The position of each label is its integer code.
lb_make.classes_

array(['acc', 'good', 'unacc', 'vgood'], dtype=object)

In [11]:
# Separate the target column from the predictors.
y = data_le['classes']
X = data_le.drop(columns=['classes'])

# Same 80/20 split and seed as the first approach, so the two scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7
)

# Fit a k-nearest-neighbours classifier with its default settings.
knn = KNeighborsClassifier().fit(X_train, y_train)

# Report test-set accuracy as a percentage.
predictions = knn.predict(X_test)
print(accuracy_score(y_test, predictions) * 100)

91.32947976878613


<b><font color='red'>Q: Where does the label encoder go wrong?</font></b>