In [1]:
import pandas as pd
import numpy as np
import string

# Generate Train and Test Data for MNIST Dataset

In [2]:
# Define parameters
make_binary = True

In [3]:
import tensorflow as tf
# Load data
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

In [4]:
# Get all x and y data together
x = np.concatenate((x_train, x_test), axis=0).astype('float32')
y = np.concatenate((y_train, y_test), axis=0)

In [5]:
# Normalizing the RGB codes by dividing it to the max RGB value
x /= 255

In [6]:
# Flatten the images
image_vector_size = 28 * 28
x = x.reshape(x.shape[0], image_vector_size)

In [7]:
# Put data processed into dataframe
data = pd.DataFrame(data=x, columns=[i + 1 for i in range(0, 784)])
data['target'] = y
data.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,776,777,778,779,780,781,782,783,784,target
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9


In [8]:
if make_binary:
    # Make it a binary classification problem class_0 = 1, class_1 = rest of digits
    for i in range(0, 10):
        if i != 1:
            y[y == i] = 0

    # Data is now unbalanced,  class_0 is much smaller than class_1
    # Shuffle and get examples from class_1 so len(class_1)==len(class_0)
    class_0 = data[data['target'] == 1]
    class_1 = data[data['target'] == 0].sample(frac=1).reset_index(drop=True)[0:len(class_0)]

    # Concatenate classes and reshuffle data
    balanced_data = pd.concat((class_0, class_1)).sample(frac=1).reset_index(drop=True)

    data = balanced_data

In [9]:
feature_names = data.columns[:-1]
feature_names

Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
       ...
       775, 776, 777, 778, 779, 780, 781, 782, 783, 784],
      dtype='object', length=784)

In [10]:
X = data.drop(columns=['target']).values
y = data['target'].values
col_names = data.columns
assert col_names[-1]=='target', 'Column names in incorrect format'

In [11]:
norm_data = pd.DataFrame(X, columns=feature_names)
norm_data['target'] = y

norm_data

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,776,777,778,779,780,781,782,783,784,target
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14775,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
14776,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
14777,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
14778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [12]:
# Split data
from sklearn.model_selection import train_test_split
train, test = train_test_split(norm_data, test_size=0.2, random_state=1)

In [13]:
datapath = '../../../../data/MNIST/'

# Save train data
train.to_csv(datapath+'_train_data.csv', index=False)

# Save test data
test.to_csv(datapath+'_test_data.csv', index=False)

In [14]:
print('There are %d training instances and %d test instances ' % (len(train), len(test))) 

There are 11824 training instances and 2956 test instances 
