In [22]:
from __future__ import print_function
import imageio
import numpy as np
import pandas as pd
import os
import sys
import tarfile
from IPython.display import display, Image
from six.moves.urllib.request import urlretrieve
import feather

In [23]:
#download the compressed datasets

# Base URL the notMNIST archives are downloaded from.
url = 'http://yaroslavvb.com/upload/notMNIST/'
# Last percentage written by download_progress_hook; None until the first report.
last_percent_reported = None
# Directory where the archives are stored. Set the NOTMNIST_DATA_ROOT environment
# variable to store data elsewhere without editing the notebook; the previous
# hard-coded location remains the default.
data_root = os.environ.get('NOTMNIST_DATA_ROOT', 'D:\\Programming\\Projects')

def download_progress_hook(count, blockSize, totalSize):
  """Report download progress; intended as the ``reporthook`` of ``urlretrieve``.

  Writes the percentage to stdout whenever it is a multiple of 5 and a single
  dot for every other change in percentage, so slow connections still show
  activity.

  Args:
    count: Number of blocks transferred so far.
    blockSize: Size of one block in bytes.
    totalSize: Total size of the file in bytes; zero or negative when unknown.
  """
  global last_percent_reported
  if totalSize <= 0:
    # The size can be reported as unknown (e.g. -1); no percentage is computable.
    return
  # count * blockSize can overshoot the real size on the final block: cap at 100.
  percent = min(100, int(count * blockSize * 100 / totalSize))

  if last_percent_reported != percent:
    if percent % 5 == 0:
      sys.stdout.write("%s%%" % percent)
    else:
      sys.stdout.write(".")
    sys.stdout.flush()
    last_percent_reported = percent
        
def maybe_download(filename, expected_bytes, force=False):
  """Fetch `filename` into `data_root` unless it is already there, then check its size.

  Returns the local path of the file. Raises Exception when the size on disk
  differs from `expected_bytes`.
  """
  dest_filename = os.path.join(data_root, filename)
  needs_download = force or not os.path.exists(dest_filename)
  if needs_download:
    print('Attempting to download:', filename) 
    urlretrieve(url + filename, dest_filename, reporthook=download_progress_hook)
    print('\nDownload Complete!')
  size_on_disk = os.stat(dest_filename).st_size
  if size_on_disk != expected_bytes:
    raise Exception(
      'Failed to verify ' + dest_filename + '. Can you get to it with a browser?')
  print('Found and verified', dest_filename)
  return dest_filename

# Fetch (or reuse) both archives; the byte counts are the expected file sizes.
train_filename = maybe_download(filename='notMNIST_large.tar.gz',
                                expected_bytes=247336696)
test_filename = maybe_download(filename='notMNIST_small.tar.gz',
                               expected_bytes=8458043)

Found and verified D:\Programming\Projects\notMNIST_large.tar.gz
Found and verified D:\Programming\Projects\notMNIST_small.tar.gz


In [24]:
#extract images from the compressed files

# Number of class folders each extracted archive must contain.
num_classes = 10
# Seed NumPy's global random generator so later shuffles are repeatable.
np.random.seed(seed=133)

def maybe_extract(filename, force=False):
  """Extract a .tar.gz archive beside itself and list its class folders.

  Args:
    filename: Path to the archive. The extraction root is this path with the
      ``.tar.gz`` suffix removed.
    force: Extract even when the root directory already exists.

  Returns:
    List of sub-directory paths under the root, sorted by directory name.

  Raises:
    Exception: If the number of sub-directories differs from ``num_classes``.
  """
  root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
  if os.path.isdir(root) and not force:
    # You may override by setting force=True.
    print('%s already present - Skipping extraction of %s.' % (root, filename))
  else:
    print('Extracting data for %s. This may take a while. Please wait.' % root)
    sys.stdout.flush()
    # Extract into the archive's own directory so the files land in `root`, the
    # directory inspected below (presumably the archive's top-level folder is
    # named after the archive - verify for new archives).
    extract_dir = os.path.dirname(root) or os.curdir
    # NOTE: extractall trusts member paths; only use it on trusted archives.
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(filename) as tar:
      tar.extractall(extract_dir)
  data_folders = [
    os.path.join(root, d) for d in sorted(os.listdir(root))
    if os.path.isdir(os.path.join(root, d))]
  if len(data_folders) != num_classes:
    raise Exception(
      'Expected %d folders, one per class. Found %d instead.' % (
        num_classes, len(data_folders)))
  print(data_folders)
  return data_folders
  
# Unpack both archives (skipped when already extracted) and keep the class folders.
train_folders = maybe_extract(filename=train_filename)
test_folders = maybe_extract(filename=test_filename)

D:\Programming\Projects\notMNIST_large already present - Skipping extraction of D:\Programming\Projects\notMNIST_large.tar.gz.
['D:\\Programming\\Projects\\notMNIST_large\\A', 'D:\\Programming\\Projects\\notMNIST_large\\B', 'D:\\Programming\\Projects\\notMNIST_large\\C', 'D:\\Programming\\Projects\\notMNIST_large\\D', 'D:\\Programming\\Projects\\notMNIST_large\\E', 'D:\\Programming\\Projects\\notMNIST_large\\F', 'D:\\Programming\\Projects\\notMNIST_large\\G', 'D:\\Programming\\Projects\\notMNIST_large\\H', 'D:\\Programming\\Projects\\notMNIST_large\\I', 'D:\\Programming\\Projects\\notMNIST_large\\J']
D:\Programming\Projects\notMNIST_small already present - Skipping extraction of D:\Programming\Projects\notMNIST_small.tar.gz.
['D:\\Programming\\Projects\\notMNIST_small\\A', 'D:\\Programming\\Projects\\notMNIST_small\\B', 'D:\\Programming\\Projects\\notMNIST_small\\C', 'D:\\Programming\\Projects\\notMNIST_small\\D', 'D:\\Programming\\Projects\\notMNIST_small\\E', 'D:\\Programming\\Projec

In [25]:
# Load each letter's images with imageio as 28 x 28 matrices and cache them per class in feather files.

# image_size: pixel width and height of every (square) image.
# pixel_depth: divisor used by load_letter when rescaling pixel values.
image_size, pixel_depth = 28, 255.0

def load_letter(folder, min_num_images):
  """Load every readable image in `folder` into one stacked 2-D float32 array.

  Each image is rescaled as (pixel - pixel_depth / 2) / pixel_depth and stored
  in `image_size` consecutive rows. Files that raise IOError or ValueError
  while being read are reported and skipped.

  Args:
    folder: Directory holding the image files of one letter.
    min_num_images: Minimum number of images that must load successfully.

  Returns:
    float32 array of shape (num_images * image_size, image_size).

  Raises:
    Exception: If an image is not image_size x image_size, or if fewer than
      `min_num_images` images were loaded.
  """
  image_files = os.listdir(folder)
  dataset = np.ndarray(shape=(len(image_files) * image_size, image_size),
                         dtype=np.float32)
  print(folder)
  num_images = 0
  for image in image_files:
    image_file = os.path.join(folder, image)
    try:
      image_data = (imageio.imread(image_file).astype(float) - 
                    pixel_depth / 2) / pixel_depth
      if image_data.shape != (image_size, image_size):
        raise Exception('Unexpected image shape: %s' % str(image_data.shape))
      # Row offsets follow image_size rather than a literal 28, so the layout
      # stays correct if image_size is changed.
      first_row = num_images * image_size
      dataset[first_row:first_row + image_size, :] = image_data
      num_images = num_images + 1
    except (IOError, ValueError) as e:
      print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.')

  # Drop the rows reserved for files that were skipped.
  dataset = dataset[0:num_images * image_size, :]
  if num_images < min_num_images:
    raise Exception('Many fewer images than expected: %d < %d' %
                    (num_images, min_num_images))

  # Integer division keeps the reported image count an integer.
  print('Full dataset tensor:', dataset.shape[0] // image_size, ',', dataset.shape[1])
  print('Mean:', np.mean(dataset))
  print('Standard deviation:', np.std(dataset))
  return dataset
        
def maybe_feather(data_folders, min_num_images_per_class, force=False):
  """Cache each class folder as '<folder>.feather' and return the cache paths.

  A folder whose feather file already exists is skipped unless `force` is set.
  A failure while writing is printed and not re-raised, and the path is still
  included in the returned list.
  """
  dataset_names = []
  for folder in data_folders:
    set_filename = folder + '.feather'
    dataset_names.append(set_filename)
    if not force and os.path.exists(set_filename):
      # You may override by setting force=True.
      print('%s already present - Skipping feathering.' % set_filename)
      continue
    print('Feathering %s.' % set_filename)
    letter_frame = pd.DataFrame(load_letter(folder, min_num_images_per_class))
    try:
      feather.write_dataframe(letter_frame, set_filename)
    except Exception as e:
      print('Unable to save data to', set_filename, ':', e)

  return dataset_names

# Build (or reuse) one feather file per class; the number is the minimum image
# count each class folder must provide.
train_datasets = maybe_feather(train_folders, min_num_images_per_class=45000)
test_datasets = maybe_feather(test_folders, min_num_images_per_class=1800)

D:\Programming\Projects\notMNIST_large\A.feather already present - Skipping feathering.
D:\Programming\Projects\notMNIST_large\B.feather already present - Skipping feathering.
D:\Programming\Projects\notMNIST_large\C.feather already present - Skipping feathering.
D:\Programming\Projects\notMNIST_large\D.feather already present - Skipping feathering.
D:\Programming\Projects\notMNIST_large\E.feather already present - Skipping feathering.
D:\Programming\Projects\notMNIST_large\F.feather already present - Skipping feathering.
D:\Programming\Projects\notMNIST_large\G.feather already present - Skipping feathering.
D:\Programming\Projects\notMNIST_large\H.feather already present - Skipping feathering.
D:\Programming\Projects\notMNIST_large\I.feather already present - Skipping feathering.
D:\Programming\Projects\notMNIST_large\J.feather already present - Skipping feathering.
D:\Programming\Projects\notMNIST_small\A.feather already present - Skipping feathering.
D:\Programming\Projects\notMNIST

In [26]:
# Randomly sample images from each class into the train, validation and test sets.

def feather_3d(file,image_size):
    """Read a feather file and return its values reshaped to (-1, image_size, image_size)."""
    frame = feather.read_dataframe(file)
    stacked_shape = (-1, image_size, image_size)
    return np.reshape(frame.values, stacked_shape)

def make_arrays(nb_rows, img_size):
  """Allocate an uninitialised image stack and label vector for `nb_rows` samples.

  Returns (dataset, labels), where dataset is float32 with shape
  (nb_rows, img_size, img_size) and labels is int32 with shape (nb_rows,).
  Returns (None, None) when `nb_rows` is falsy.
  """
  if not nb_rows:
    return None, None
  images = np.ndarray((nb_rows, img_size, img_size), dtype=np.float32)
  targets = np.ndarray(nb_rows, dtype=np.int32)
  return images, targets

def merge_datasets(feather_files, train_size, valid_size=0):
  """Build class-balanced validation and training arrays from per-class files.

  Each file is loaded with feather_3d and shuffled. Its first
  valid_size // num_classes images go to the validation set and the following
  train_size // num_classes images go to the training set, so no image is used
  in both. The label of a sample is the index of its file in `feather_files`.

  Args:
    feather_files: Sequence of per-class feather file paths, one per label.
    train_size: Total number of training samples; 0 returns None for them.
    valid_size: Total number of validation samples; 0 returns None for them.

  Returns:
    (valid_dataset, valid_labels, train_dataset, train_labels). If a size is
    not a multiple of the number of classes, the trailing rows of that array
    are left unfilled.

  Raises:
    Exception: Any error while processing a file is printed and re-raised.
  """
  num_classes = len(feather_files)
  valid_dataset, valid_labels = make_arrays(valid_size, image_size)
  train_dataset, train_labels = make_arrays(train_size, image_size)
  vsize_per_class = valid_size // num_classes
  tsize_per_class = train_size // num_classes

  start_v, start_t = 0, 0
  end_v, end_t = vsize_per_class, tsize_per_class
  end_l = vsize_per_class + tsize_per_class
  for label, feather_file in enumerate(feather_files):
    try:
      letter_set = feather_3d(feather_file, image_size)
      # Shuffle so the validation and training picks are random.
      np.random.shuffle(letter_set)
      if valid_dataset is not None:
        valid_letter = letter_set[:vsize_per_class, :, :]
        valid_dataset[start_v:end_v, :, :] = valid_letter
        valid_labels[start_v:end_v] = label
        start_v += vsize_per_class
        end_v += vsize_per_class

      # The training fill must not depend on a validation set being requested;
      # otherwise valid_size=0 returns an unfilled training set.
      if train_dataset is not None:
        # Start after the validation slice so the two sets do not overlap.
        train_letter = letter_set[vsize_per_class:end_l, :, :]
        train_dataset[start_t:end_t, :, :] = train_letter
        train_labels[start_t:end_t] = label
        start_t += tsize_per_class
        end_t += tsize_per_class
    except Exception as e:
      print('Unable to process data from', feather_file, ':', e)
      raise

  return valid_dataset, valid_labels, train_dataset, train_labels


# Requested number of samples in each split.
train_size, valid_size, test_size = 200000, 20000, 12000

(valid_dataset, valid_labels,
 train_dataset, train_labels) = merge_datasets(train_datasets, train_size, valid_size)
# A validation size of 1 is requested for the test archive; the validation
# outputs of this call are discarded.
_, _, test_dataset, test_labels = merge_datasets(test_datasets, test_size, 1)

# Report the shape of every split as a sanity check.
print('Training:', train_dataset.shape, train_labels.shape)
print('Validation:', valid_dataset.shape, valid_labels.shape)
print('Testing:', test_dataset.shape, test_labels.shape)

Training: (200000, 28, 28) (200000,)
Validation: (20000, 28, 28) (20000,)
Testing: (12000, 28, 28) (12000,)


In [27]:
#shuffle 3 datasets

def randomize(dataset, labels):
  """Shuffle `dataset` and `labels` with one shared random permutation.

  Returns new (dataset, labels) arrays in which sample i and label i still
  belong together.
  """
  # One random ordering of the indices 0..len(labels)-1, applied to both arrays.
  order = np.random.permutation(labels.shape[0])
  return dataset[order, :, :], labels[order]

# Shuffle each split in turn. The names are rebound to the shuffled copies, so
# re-running this cell shuffles the data again.
train_dataset, train_labels = randomize(dataset=train_dataset, labels=train_labels)
test_dataset, test_labels = randomize(dataset=test_dataset, labels=test_labels)
valid_dataset, valid_labels = randomize(dataset=valid_dataset, labels=valid_labels)

In [29]:
#save them in feather format

# Folder under data_root that receives the merged feather files.
feather_folder = os.path.join(data_root, 'notMNIST')

if not os.path.exists(feather_folder):
    try:
        os.makedirs(feather_folder)
    except OSError:
        # The failure is only reported; nothing is re-raised here.
        print('Error: Fail to create directory in', feather_folder)


def save_feather3d(dataset, filename,image_size = 1, force = False):
    """Write `dataset` to '<feather_folder>/<filename>.feather' as a 2-D frame.

    The array is reshaped to (-1, image_size) before being written. An existing
    file is left untouched, and a message is printed, unless `force` is true.
    """
    target_path = os.path.join(feather_folder, filename + '.feather')
    if not force and os.path.exists(target_path):
        print(filename + '.feather', "already existed")
        return
    flattened = dataset.reshape(-1, image_size)
    feather.write_dataframe(pd.DataFrame(flattened), target_path)
    
# Image stacks are written image_size columns wide; label vectors use the
# default single column. Only the test files are forced to be rewritten.
save_feather3d(train_dataset, 'train_dataset', image_size=image_size)
save_feather3d(test_dataset, 'test_dataset', image_size=image_size, force=True)
save_feather3d(valid_dataset, 'valid_dataset', image_size=image_size)
save_feather3d(train_labels, 'train_labels')
save_feather3d(test_labels, 'test_labels', force=True)
save_feather3d(valid_labels, 'valid_labels')

train_dataset.feather already existed
valid_dataset.feather already existed
train_labels.feather already existed
valid_labels.feather already existed


In [97]:

# Bundle every entry of feather_folder into '<feather_folder>.tar.gz'.
# The context manager closes the archive even if adding a file fails.
with tarfile.open(feather_folder + '.tar.gz', 'w:gz') as tar:
    for name in os.listdir(feather_folder):
        # No arcname is passed, so each member keeps the path it is added with.
        tar.add(os.path.join(feather_folder, name))
#write the compressed file.

In [86]:
# NOTE(review): `df` is not assigned at notebook level in any visible cell (it is
# only a local inside save_feather3d), so this cell presumably relied on leftover
# kernel state and would fail on a fresh Restart & Run All - confirm or remove.
# Write `df` to 'test.feather' in the working directory and read it back.
feather.write_dataframe(df, 'test.feather')
df_new = feather.read_dataframe('test.feather')
# Bare expression: displays the path built for 'test_set.feather' under data_root/notMNIST.
os.path.join(data_root, 'notMNIST','test_set'+'.feather')

'D:\\Programming\\Projects\\notMNIST\\test_set.feather'