In [1]:
import pandas as pd
from numpy import asarray

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
# load the dataset
df = pd.read_csv('https://raw.githubusercontent.com/jbrownlee/Datasets/master/breast-cancer.csv')

In [4]:
df.columns

Index([''40-49'', ''premeno'', ''15-19'', ''0-2'', ''yes'', ''3'', ''right'',
       ''left_up'', ''no'', ''recurrence-events''],
      dtype='object')

In [5]:
# retrieve the array of data
data = df.values

In [6]:
data.shape

(285, 10)

In [7]:
# separate the data into input and output columns
X = data[:, :-1].astype(str)
y = data[:, -1].astype(str)

In [8]:
X[0:5]

array([["'50-59'", "'ge40'", "'15-19'", "'0-2'", "'no'", "'1'",
        "'right'", "'central'", "'no'"],
       ["'50-59'", "'ge40'", "'35-39'", "'0-2'", "'no'", "'2'", "'left'",
        "'left_low'", "'no'"],
       ["'40-49'", "'premeno'", "'35-39'", "'0-2'", "'yes'", "'3'",
        "'right'", "'left_low'", "'yes'"],
       ["'40-49'", "'premeno'", "'30-34'", "'3-5'", "'yes'", "'2'",
        "'left'", "'right_up'", "'no'"],
       ["'50-59'", "'premeno'", "'25-29'", "'3-5'", "'no'", "'2'",
        "'right'", "'left_up'", "'yes'"]], dtype='<U11')

In [9]:
y[0:5]

array(["'no-recurrence-events'", "'recurrence-events'",
       "'no-recurrence-events'", "'recurrence-events'",
       "'no-recurrence-events'"], dtype='<U22')

In [15]:
# split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

In [16]:
X_train.shape

(191, 9)

In [17]:
X_test.shape

(95, 9)

<h3>Ordinal Encoding</h3>
<p>Each unique category value is assigned a integer value. </p>

In [18]:
# ordinal encode input variables
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(X_train)
X_train_e = ordinal_encoder.transform(X_train)
X_test_e = ordinal_encoder.transform(X_test)

In [19]:
# ordinal encode target variable
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train_e = label_encoder.transform(y_train)
y_test_e = label_encoder.transform(y_test)

In [20]:
# define the model and fit on the training set
model = LogisticRegression()
model.fit(X_train_e, y_train_e)
# predict on test set
yhat = model.predict(X_test_e)

In [21]:
# evaluate predictions
accuracy = accuracy_score(y_test_e, yhat)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 75.79


<h3>One-Hot Encoding</h3>
<p>The integer encoded variable is removed and one new binary variable is added for each unique integer value in the variable.</p>

In [22]:
# one-hot encode input variables
onehot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
onehot_encoder.fit(X_train)
X_train_e = onehot_encoder.transform(X_train)
X_test_e = onehot_encoder.transform(X_test)

In [23]:
# ordinal encode target variable
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train_e = label_encoder.transform(y_train)
y_test_e = label_encoder.transform(y_test)

In [24]:
# define the model and fit on the training set
model = LogisticRegression()
model.fit(X_train_e, y_train_e)
# predict on test set
yhat = model.predict(X_test_e)

In [25]:
# evaluate predictions
accuracy = accuracy_score(y_test_e, yhat)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 70.53


<h3>Dummy Variable Encoding</h3>
<p>Dummy variable encoding represents C categories with C-1 binary variables. The level with no dummy variable is known as the baseline.</p>

In [26]:
# dummy variable encode input variables
onehot_encoder = OneHotEncoder(drop='first',sparse=False)
onehot_encoder.fit(X_train)
X_train_e = onehot_encoder.transform(X_train)
X_test_e = onehot_encoder.transform(X_test)

In [27]:
# ordinal encode target variable
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train_e = label_encoder.transform(y_train)
y_test_e = label_encoder.transform(y_test)

In [28]:
# define the model and fit on the training set
model = LogisticRegression()
model.fit(X_train_e, y_train_e)
# predict on test set
yhat = model.predict(X_test_e)

In [29]:
# evaluate predictions
accuracy = accuracy_score(y_test_e, yhat)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 70.53


#### In this case, the ordinal encoding model achieved the best classification accuracy of 75.79 percent, which is slightly better than the one-hot encoding and dummy variable encoding. The one-hot encoding and dummy variable encoding resulted in the same accuracy score of 70.53 percent.