In [1]:
import pandas as pd
from PIL import Image
import numpy as np
import string
import os

In [2]:
# Relative path to the EMNIST "letters" training split in CSV form.
path = './emnist_subset/emnist-letters-train.csv'
# NOTE(review): read_csv treats the first line of the file as column names by
# default. The column labels shown in the output ("23", "0", "0.1", ...) look
# like a data row, so the first sample may be consumed as the header --
# confirm whether header=None is needed (later cells rely on the '23' label).
df = pd.read_csv(path)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,23,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,0.406,0.407,0.408,0.409,0.410,0.411,0.412,0.413,0.414,0.415
0,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,16,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,15,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,23,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,17,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Extract only data containing letters M or T
Each row is an image of a letter, where the identity of the letter is stored in the first column, "23".

To extract only the letters M and T, their label values (the 1-based positions of the letters in the alphabet) are looked up using Python's `string.ascii_letters` and then used as the filter conditions for the extraction.

In [3]:
# Show which label values occur in the first column.
print("\nUnique keys in df:\n ",sorted(df['23'].unique()))

print("string.ascii_letters : ", string.ascii_letters)
# String positions are 0-based while the labels start at 1, hence the +1.
key_m = string.ascii_letters.index('m') + 1
key_t = string.ascii_letters.index('t') + 1
print("\nPositions of M and T in string.ascii_letters: {} and {}.".format(key_m, key_t))

# Keep only the rows whose label matches each letter.
df_m = df[df['23'] == key_m]
df_t = df[df['23'] == key_t]


Unique keys in df:
  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
string.ascii_letters :  abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ

Positions of M and T in string.ascii_letters: 13 and 20.


In [4]:
# Build the array straight from `df` rather than re-slicing `df_m` itself:
# `df_m` changes type here (DataFrame -> ndarray), so a self-referential
# `df_m = df_m.iloc[...]` would fail if this cell were executed a second time.
# Keep the first 500 'M' rows and drop column 0 (the label) so that only the
# pixel values remain.
df_m = df.loc[df['23'] == key_m].iloc[0:500, 1:].to_numpy()
df_m.shape

(500, 784)

In [5]:
# Build the array straight from `df` rather than re-slicing `df_t` itself:
# `df_t` changes type here (DataFrame -> ndarray), so a self-referential
# `df_t = df_t.iloc[...]` would fail if this cell were executed a second time.
# Keep the first 500 'T' rows and drop column 0 (the label) so that only the
# pixel values remain.
df_t = df.loc[df['23'] == key_t].iloc[0:500, 1:].to_numpy()
df_t.shape

(500, 784)

## Array to Image conversion

Each filtered row is then converted into a JPEG image in RGB format.

In [6]:
def csv2images(row, output_folder, subfolder, count):
    """Save one flattened 28 x 28 image row as an RGB JPEG file.

    The image is written to ``<output_folder>/<subfolder>/<count>.jpg``. The
    sub-folder is created first if it does not exist yet.

    Parameters
    ----------
    row : array-like
        784 pixel values. They are cast to ``uint8`` before conversion, so
        they are presumably 0-255 greyscale intensities (not checked here).
    output_folder : str
        Base directory for the generated images.
    subfolder : str
        Directory (relative to ``output_folder``) that receives the file.
    count : int
        Number used as the file name (without extension).

    Raises
    ------
    ValueError
        Raised by ``reshape`` when ``row`` does not hold exactly 784 values.
    """
    # create new subfolder (ignore if exist), new file name
    image_Folder_Path = os.path.join(output_folder, subfolder)
    os.makedirs(image_Folder_Path, exist_ok=True)
    # os.path.join (instead of gluing with '/') avoids a doubled separator
    # when `subfolder` already ends with one, and stays platform-neutral.
    image_Path = os.path.join(image_Folder_Path, str(count) + '.jpg')

    # reshape a 784 array into 28 x 28 matrix
    # EMNIST data requires transpose to display correctly
    image_array = np.asarray(row).reshape(28, 28).T
    # convert from original greyscale mode to RGB mode
    new_image = Image.fromarray(image_array.astype('uint8')).convert('RGB')

    new_image.save(image_Path)
    

In [7]:
folder = './emnist_subset/'
folderpath_m = 'letter_M/'

# Write every 'M' row to disk; files are numbered from 1 in row order.
count = 0  # stays 0 when there is nothing to save
for count, m in enumerate(df_m, start=1):
    csv2images(m, folder, folderpath_m, count)

In [8]:
folderpath_t = 'letter_T/'

# Write every 'T' row to disk; files are numbered from 1 in row order.
count = 0  # stays 0 when there is nothing to save
for count, t in enumerate(df_t, start=1):
    csv2images(t, folder, folderpath_t, count)

In [9]:
# Spot-check one saved image: it should report RGB bands while its content is still greyscale

def is_grey_scale(img_path):
    """Tell whether every pixel of an image has identical R, G and B values.

    The colour bands of the file are printed as a side effect. The pixels are
    inspected through an RGB view of the image, so single-band or RGBA files
    can be checked as well.

    Parameters
    ----------
    img_path : str
        Path of the image file to inspect.

    Returns
    -------
    bool
        True when R == G == B holds for every pixel, False as soon as one
        pixel has channels that differ.
    """
    # The context manager releases the file handle; the converted copy stays
    # usable after the source image has been closed.
    with Image.open(img_path) as img:
        print( "Color bands of image {} : {}".format(img_path, img.getbands()) )
        rgb_img = img.convert('RGB')

    w, h = rgb_img.size
    for i in range(w):
        for j in range(h):
            r, g, b = rgb_img.getpixel((i,j))
            # `r != g != b` would chain as `r != g and g != b` and miss pixels
            # such as (7, 7, 9); a pixel is grey only if all three are equal.
            if not (r == g == b):
                return False
    return True

# Run the check on one of the generated 'M' images and report the verdict.
f_name = './emnist_subset/letter_M/14.jpg'
greyscale_verdict = is_grey_scale(f_name)
print("Is the image a greyscale actually? {}".format(greyscale_verdict))

Color bands of image ./emnist_subset/letter_M/14.jpg : ('R', 'G', 'B')
Is the image a greyscale actually? True
