# Ordinal Encode Categorical Data

In [45]:
# example of ordinal encoding for a neural network
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from keras.models import Sequential
from keras.layers import Dense

# load the dataset
def load_dataset(filename):
	# load the dataset as a pandas DataFrame
	data = read_csv(filename, header=None)
	# retrieve numpy array
	dataset = data.values
	# split into input (X) and output (y) variables
	X = dataset[:, :-1]
	y = dataset[:,-1]
	# format all fields as string
	X = X.astype(str)
	# reshape target to be a 2d array
	y = y.reshape((len(y), 1))
	return X, y

# prepare input data
def prepare_inputs(X_train, X_test):
	oe = OrdinalEncoder()
	oe.fit(X_train)
	X_train_enc = oe.transform(X_train)
	X_test_enc = oe.transform(X_test)
	return X_train_enc, X_test_enc

# prepare target
def prepare_targets(y_train, y_test):
	le = LabelEncoder()
	le.fit(y_train)
	y_train_enc = le.transform(y_train)
	y_test_enc = le.transform(y_test)
	return y_train_enc, y_test_enc

In [46]:
# load the dataset
X, y = load_dataset('breast-cancer.csv')
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# prepare input data
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
# prepare output data
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)
# define the  model
model = Sequential()
model.add(Dense(10, input_dim=X_train_enc.shape[1], activation='relu', kernel_initializer='he_normal'))
model.add(Dense(1, activation='sigmoid'))
# compile the keras model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit the keras model on the dataset
model.fit(X_train_enc, y_train_enc, epochs=100, batch_size=16, verbose=2)
# evaluate the keras model
_, accuracy = model.evaluate(X_test_enc, y_test_enc, verbose=0)
print('Accuracy: %.2f' % (accuracy*100))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Epoch 1/100
 - 1s - loss: 0.9568 - acc: 0.7068
Epoch 2/100
 - 0s - loss: 0.8857 - acc: 0.7068
Epoch 3/100
 - 0s - loss: 0.8280 - acc: 0.6911
Epoch 4/100
 - 0s - loss: 0.7850 - acc: 0.6702
Epoch 5/100
 - 0s - loss: 0.7559 - acc: 0.6440
Epoch 6/100
 - 0s - loss: 0.7364 - acc: 0.6440
Epoch 7/100
 - 0s - loss: 0.7193 - acc: 0.6387
Epoch 8/100
 - 0s - loss: 0.7038 - acc: 0.6387
Epoch 9/100
 - 0s - loss: 0.6924 - acc: 0.6440
Epoch 10/100
 - 0s - loss: 0.6791 - acc: 0.6545
Epoch 11/100
 - 0s - loss: 0.6698 - acc: 0.6545
Epoch 12/100
 - 0s - loss: 0.6594 - acc: 0.6649
Epoch 13/100
 - 0s - loss: 0.6505 - acc: 0.6702
Epoch 14/100
 - 0s - loss: 0.6420 - acc: 0.6859
Epoch 15/100
 - 0s - loss: 0.6346 - acc: 0.6859
Epoch 16/100
 - 0s - loss: 0.6262 - acc: 0.6911
Epoch 17/100
 - 0s - loss: 0.6192 - acc: 0.6963
Epoch 18/100
 - 0s - loss: 0.6121 - acc: 0.6963
Epoch 19/100
 - 0s - loss: 0.6060 - acc: 0.7016
Epoch 20/100
 - 0s - loss: 0.5996 - acc: 0.7173
Epoch 21/100
 - 0s - loss: 0.5942 - acc: 0.7382
E

In [47]:
X

array([["'40-49'", "'premeno'", "'15-19'", ..., "'right'", "'left_up'",
        "'no'"],
       ["'50-59'", "'ge40'", "'15-19'", ..., "'right'", "'central'",
        "'no'"],
       ["'50-59'", "'ge40'", "'35-39'", ..., "'left'", "'left_low'",
        "'no'"],
       ...,
       ["'30-39'", "'premeno'", "'30-34'", ..., "'right'", "'right_up'",
        "'no'"],
       ["'50-59'", "'premeno'", "'15-19'", ..., "'right'", "'left_low'",
        "'no'"],
       ["'50-59'", "'ge40'", "'40-44'", ..., "'left'", "'right_up'",
        "'no'"]], dtype='<U11')

In [48]:
y

array([["'recurrence-events'"],
       ["'no-recurrence-events'"],
       ["'recurrence-events'"],
       ["'no-recurrence-events'"],
       ["'recurrence-events'"],
       ["'no-recurrence-events'"],
       ["'no-recurrence-events'"],
       ["'no-recurrence-events'"],
       ["'no-recurrence-events'"],
       ["'no-recurrence-events'"],
       ["'no-recurrence-events'"],
       ["'no-recurrence-events'"],
       ["'no-recurrence-events'"],
       ["'no-recurrence-events'"],
       ["'recurrence-events'"],
       ["'no-recurrence-events'"],
       ["'no-recurrence-events'"],
       ["'no-recurrence-events'"],
       ["'no-recurrence-events'"],
       ["'no-recurrence-events'"],
       ["'recurrence-events'"],
       ["'no-recurrence-events'"],
       ["'no-recurrence-events'"],
       ["'no-recurrence-events'"],
       ["'recurrence-events'"],
       ["'no-recurrence-events'"],
       ["'no-recurrence-events'"],
       ["'no-recurrence-events'"],
       ["'no-recurrence-events'"],
   

# One Hot Encode Categorical Data

In [43]:
# example of one hot encoding for a neural network
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from keras.models import Sequential
from keras.layers import Dense

# load the dataset
def load_dataset(filename):
	# load the dataset as a pandas DataFrame
	data = read_csv(filename, header=None)
	# retrieve numpy array
	dataset = data.values
	# split into input (X) and output (y) variables
	X = dataset[:, :-1]
	y = dataset[:,-1]
	# format all fields as string
	X = X.astype(str)
	# reshape target to be a 2d array
	y = y.reshape((len(y), 1))
	return X, y

# prepare input data
def prepare_inputs(X_train, X_test):
	ohe = OneHotEncoder()
	ohe.fit(X_train)
	X_train_enc = ohe.transform(X_train)
	X_test_enc = ohe.transform(X_test)
	return X_train_enc, X_test_enc

# prepare target
def prepare_targets(y_train, y_test):
	le = LabelEncoder()
	le.fit(y_train)
	y_train_enc = le.transform(y_train)
	y_test_enc = le.transform(y_test)
	return y_train_enc, y_test_enc

In [44]:
# load the dataset
X, y = load_dataset('breast-cancer.csv')
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# prepare input data
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
# prepare output data
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)
# define the  model
model = Sequential()
model.add(Dense(10, input_dim=X_train_enc.shape[1], activation='relu', kernel_initializer='he_normal'))
model.add(Dense(1, activation='sigmoid'))
# compile the keras model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit the keras model on the dataset
model.fit(X_train_enc, y_train_enc, epochs=100, batch_size=16, verbose=2)
# evaluate the keras model
_, accuracy = model.evaluate(X_test_enc, y_test_enc, verbose=0)
print('Accuracy: %.2f' % (accuracy*100))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Epoch 1/100
 - 1s - loss: 0.8628 - acc: 0.2827
Epoch 2/100
 - 0s - loss: 0.8001 - acc: 0.3298
Epoch 3/100
 - 0s - loss: 0.7503 - acc: 0.3927
Epoch 4/100
 - 0s - loss: 0.7153 - acc: 0.4712
Epoch 5/100
 - 0s - loss: 0.6855 - acc: 0.5707
Epoch 6/100
 - 0s - loss: 0.6637 - acc: 0.6387
Epoch 7/100
 - 0s - loss: 0.6438 - acc: 0.6649
Epoch 8/100
 - 0s - loss: 0.6279 - acc: 0.7120
Epoch 9/100
 - 0s - loss: 0.6142 - acc: 0.7225
Epoch 10/100
 - 0s - loss: 0.6023 - acc: 0.7277
Epoch 11/100
 - 0s - loss: 0.5909 - acc: 0.7330
Epoch 12/100
 - 0s - loss: 0.5823 - acc: 0.7225
Epoch 13/100
 - 0s - loss: 0.5741 - acc: 0.7225
Epoch 14/100
 - 0s - loss: 0.5677 - acc: 0.7277
Epoch 15/100
 - 0s - loss: 0.5613 - acc: 0.7277
Epoch 16/100
 - 0s - loss: 0.5566 - acc: 0.7277
Epoch 17/100
 - 0s - loss: 0.5516 - acc: 0.7277
Epoch 18/100
 - 0s - loss: 0.5471 - acc: 0.7330
Epoch 19/100
 - 0s - loss: 0.5433 - acc: 0.7382
Epoch 20/100
 - 0s - loss: 0.5396 - acc: 0.7382
Epoch 21/100
 - 0s - loss: 0.5358 - acc: 0.7435
E

In [13]:
y.shape

(286, 1)

In [14]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,'40-49','premeno','15-19','0-2','yes','3','right','left_up','no','recurrence-events'
1,'50-59','ge40','15-19','0-2','no','1','right','central','no','no-recurrence-events'
2,'50-59','ge40','35-39','0-2','no','2','left','left_low','no','recurrence-events'
3,'40-49','premeno','35-39','0-2','yes','3','right','left_low','yes','no-recurrence-events'
4,'40-49','premeno','30-34','3-5','yes','2','left','right_up','no','recurrence-events'


In [15]:
data.dtypes

0    object
1    object
2    object
3    object
4    object
5    object
6    object
7    object
8    object
9    object
dtype: object