In [3]:
import pandas as pd
import numpy as np
import string

Letters are represented with pixels that have been transformed into 16 attributes

Information about the Letter Recognition Dataset
The dataset is from the UCI Machine Learning Repository: https://archive.ics.uci.edu/ml/datasets/Letter+Recognition

Very similar problem to MNIST
Task: classify letters

The original pixel images have been transformed into 16 numerical attributes that describe characteristics of each letter; each attribute takes an integer value from 0 to 15.

Trained a neural network with 2 hidden layers and structure: 16-40-30-2

Reducing the dataset to perform binary classification task
- class_0: 'A'
- class_1: any other letter

# Generate Train and Test Data for Letter Recognition Dataset

In [4]:
# Define parameters
# make_binary: when True, the labels are collapsed to 'A' (class 0) vs. any
# other letter (class 1) and the two classes are balanced by undersampling;
# when False, all 26 encoded letter classes are kept.
make_binary = True

In [5]:
# Load the raw letter data. Later cells rely on a 'target' column that holds
# the uppercase letter label for each row.
data = pd.read_csv('data.csv')

In [6]:
# Encode letters as numbers: 'A' -> 0, 'B' -> 1, ..., 'Z' -> 25
letters = string.ascii_uppercase
encodings = {letter: code for code, letter in enumerate(letters)}

# Translate the whole column in a single pass instead of issuing one masked
# assignment per letter. Values that are not uppercase letters are passed
# through unchanged, so re-running this cell on already-encoded data is a no-op.
data['target'] = data['target'].map(lambda label: encodings.get(label, label))

In [7]:
if make_binary:
    # Make it a binary classification problem: class_0 = 'A' (encoded as 0),
    # class_1 = any other letter (encoded as 1..25, collapsed to 1 here).
    data.loc[data['target'].isin(range(1, 26)), 'target'] = 1

    # Balance the data by undersampling class_1 down to the size of class_0.
    class_0 = data[data['target'] == 0]
    other_letters = data[data['target'] == 1]
    # Seeded draw so that "Restart & Run All" selects the same rows every time
    # (same seed value that the train/test split uses).
    class_1 = (
        other_letters
        .sample(n=min(len(class_0), len(other_letters)), random_state=1)
        .reset_index(drop=True)
    )

    # Shuffle all the data (seeded as well) and renumber the rows.
    balanced_data = (
        pd.concat((class_0, class_1))
        .sample(frac=1, random_state=1)
        .reset_index(drop=True)
    )
    data = balanced_data

In [9]:
# Reorder the columns so that 'target' is the last one
target_col = data['target']
feature_cols = [name for name in data.columns if name != 'target']
data = data.reindex(columns=feature_cols + ['target'])

data.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,target
0,3,6,5,4,2,10,3,2,2,8,3,10,2,6,2,7,0
1,3,10,4,7,5,6,9,8,4,7,6,8,2,7,5,11,1
2,5,7,7,6,6,9,8,3,4,7,7,8,5,6,5,4,0
3,4,8,4,6,2,3,6,6,11,7,7,15,0,8,7,7,1
4,3,7,5,5,3,11,2,3,3,10,2,10,2,6,2,8,0


In [10]:
# Sanity check: the label must be the final column, so that every column
# before it can be treated as a feature.
col_names = data.columns
last_col = col_names[-1]
assert last_col == 'target', 'Column names in incorrect format'

feature_names = col_names[:-1]

In [11]:
# Separate the frame into a feature matrix and a label vector
is_feature = data.columns != 'target'
X = data.loc[:, is_feature].values  # [rows, cols]
y = data.loc[:, 'target'].values

In [12]:
# Rescale every attribute to the 0-1 range so that attributes with larger raw
# values do not carry more significance in the network.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so the test rows also contribute to the min/max statistics.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

In [13]:
# Rebuild a labelled frame from the scaled features, with 'target' appended as
# the last column.
norm_data = pd.DataFrame(X, columns=feature_names).assign(target=y)

In [16]:
# Split data: hold out 20% of the rows for testing; the fixed random_state
# keeps the partition the same on every run.
from sklearn.model_selection import train_test_split

train, test = train_test_split(
    norm_data,
    test_size=0.2,
    random_state=1,
)

In [19]:
import os

datapath = '../../../../data/LetterRecognition/'

# Create the output directory when it is missing so this cell also runs on a
# fresh checkout instead of failing inside to_csv.
os.makedirs(datapath, exist_ok=True)

# Save train data
train.to_csv(os.path.join(datapath, '_train_data.csv'), index=False)

# Save test data
test.to_csv(os.path.join(datapath, '_test_data.csv'), index=False)

In [18]:
# Report the size of each partition
n_train, n_test = len(train), len(test)
print('There are %d training instances and %d test instances ' % (n_train, n_test))

There are 1262 training instances and 316 test instances 
