In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import tarfile
import urllib
from IPython.display import display, Image
from scipy import ndimage
from sklearn.linear_model import LogisticRegression
import pickle
import imagehash
from PIL import Image as Img

In [2]:
# %load extract.py
# One sub-folder per class letter (A-J) under each extracted archive root.
# Paths are built with a literal '/' so they match the original strings exactly.
train_folders = ['notMNIST_large/' + letter for letter in 'ABCDEFGHIJ']
test_folders = ['notMNIST_small/' + letter for letter in 'ABCDEFGHIJ']

In [10]:
# %load load_data.py
image_size = 28  # Pixel width and height.
pixel_depth = 255.0  # Number of levels per pixel.
# Maps the dhash of every accepted image to the path it was first seen at.
# It lives at module level, so it persists across load() calls: an image whose
# hash was recorded by an earlier call is skipped by later ones.
db = {}

def load(data_folders, min_num_images, max_num_images):
  """Load the images found in `data_folders` into a dataset/label array pair.

  Each folder holds the images of one class; the folder's position in
  `data_folders` is used as its integer label. An image is skipped when its
  dhash is already present in the module-level `db`. Pixel values are shifted
  and scaled as (value - pixel_depth / 2) / pixel_depth.

  Args:
    data_folders: sequence of directory paths, one per class.
    min_num_images: minimum number of images that must be loaded.
    max_num_images: capacity of the pre-allocated arrays.

  Returns:
    (dataset, labels): a float32 array of shape
    (num_images, image_size, image_size) and an int32 array of shape
    (num_images,), both trimmed to the number of images actually kept.

  Raises:
    Exception: if a file is encountered once `max_num_images` images have
      been stored, if an image is not image_size x image_size, or if fewer
      than `min_num_images` images were loaded.

  Files that raise IOError while being read are reported and skipped.
  """
  dataset = np.ndarray(
    shape=(max_num_images, image_size, image_size), dtype=np.float32)
  labels = np.ndarray(shape=(max_num_images), dtype=np.int32)
  image_index = 0

  for label_index, folder in enumerate(data_folders):
    print(folder)
    for image in os.listdir(folder):
      if image_index >= max_num_images:
        # Report image_index here: num_images is only assigned after the
        # loops, so formatting it raised UnboundLocalError instead of this
        # message.
        raise Exception('More images than expected: %d >= %d' % (
          image_index, max_num_images))
      image_file = os.path.join(folder, image)
      try:
        # Hash inside a context manager so the file handle is closed promptly.
        with Img.open(image_file) as img:
          hash_key = str(imagehash.dhash(img))
        if hash_key in db:
          continue  # Same dhash as an image that was already accepted.
        db[hash_key] = image_file
        # NOTE(review): ndimage.imread was deprecated in SciPy 1.0 and removed
        # in 1.2; this call needs an older SciPy (or a port to PIL/imageio).
        image_data = (ndimage.imread(image_file).astype(float) -
                      pixel_depth / 2) / pixel_depth
        if image_data.shape != (image_size, image_size):
          raise Exception('Unexpected image shape: %s' % str(image_data.shape))
        dataset[image_index, :, :] = image_data
        labels[image_index] = label_index
        image_index += 1
      except IOError as e:
        print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.')

  # Trim the over-allocated arrays down to the images actually stored.
  num_images = image_index
  dataset = dataset[0:num_images, :, :]
  labels = labels[0:num_images]
  if num_images < min_num_images:
    raise Exception('Many fewer images than expected: %d < %d' % (
        num_images, min_num_images))
  print('Full dataset tensor:', dataset.shape)
  print('Mean:', np.mean(dataset))
  print('Standard deviation:', np.std(dataset))
  print('Labels:', labels.shape)
  return dataset, labels
# Order matters: both calls share the module-level `db`, so any test image
# whose hash was already recorded while loading the training set is dropped.
train_dataset, train_labels = load(
    train_folders, min_num_images=300000, max_num_images=550000)
test_dataset, test_labels = load(
    test_folders, min_num_images=9000, max_num_images=20000)

notMNIST_large/A
Could not read: notMNIST_large/A/RnJlaWdodERpc3BCb29rSXRhbGljLnR0Zg==.png : cannot identify image file 'notMNIST_large/A/RnJlaWdodERpc3BCb29rSXRhbGljLnR0Zg==.png' - it's ok, skipping.
Could not read: notMNIST_large/A/SG90IE11c3RhcmQgQlROIFBvc3Rlci50dGY=.png : cannot identify image file 'notMNIST_large/A/SG90IE11c3RhcmQgQlROIFBvc3Rlci50dGY=.png' - it's ok, skipping.
Could not read: notMNIST_large/A/Um9tYW5hIEJvbGQucGZi.png : cannot identify image file 'notMNIST_large/A/Um9tYW5hIEJvbGQucGZi.png' - it's ok, skipping.
notMNIST_large/B
Could not read: notMNIST_large/B/TmlraXNFRi1TZW1pQm9sZEl0YWxpYy5vdGY=.png : cannot identify image file 'notMNIST_large/B/TmlraXNFRi1TZW1pQm9sZEl0YWxpYy5vdGY=.png' - it's ok, skipping.
notMNIST_large/C
notMNIST_large/D
Could not read: notMNIST_large/D/VHJhbnNpdCBCb2xkLnR0Zg==.png : cannot identify image file 'notMNIST_large/D/VHJhbnNpdCBCb2xkLnR0Zg==.png' - it's ok, skipping.
notMNIST_large/E
notMNIST_large/F
notMNIST_large/G
notMNIST_large/H


In [11]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; prefer its replacement and fall back only on old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 30% of the loaded training images for validation; the fixed
# random_state keeps the split the same from run to run.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    train_dataset, train_labels, test_size = 0.3, random_state = 1)

In [12]:
# Training split after the 70/30 split: (samples, height, width).
X_train.shape

(256718, 28, 28)

In [13]:
# Validation split (the held-out 30%): (samples, height, width).
X_valid.shape

(110023, 28, 28)

In [14]:
# One label per training image; the length should match X_train.shape[0].
Y_train.shape

(256718,)

In [15]:
# One label per validation image; the length should match X_valid.shape[0].
Y_valid.shape

(110023,)

In [16]:
# Number of distinct image hashes recorded across both load() calls.
len(db)

375891

In [17]:
# Sanity check: total images kept (train + test) should equal len(db), since
# every stored image added exactly one hash entry.
train_labels.shape[0] + test_labels.shape[0]

375891

In [19]:
pickle_file = 'notMNIST.pickle'

# Persist all three splits in one file so later sessions can skip the slow
# image-loading step. The `with` block closes the file even when
# pickle.dump fails part-way; the original leaked the handle in that case.
try:
  with open(pickle_file, 'wb') as f:
    save = {
      'train_dataset': X_train,
      'train_labels': Y_train,
      'valid_dataset': X_valid,
      'valid_labels': Y_valid,
      'test_dataset': test_dataset,
      'test_labels': test_labels,
      }
    pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
  # Report which file failed, then re-raise so the failure is not hidden.
  print('Unable to save data to', pickle_file, ':', e)
  raise

In [21]:
# Report the on-disk size of the pickle in bytes.
# NOTE(review): the file is written with plain pickle.dump, so despite the
# label nothing is actually compressed.
statinfo = os.stat(pickle_file)
pickle_size_bytes = statinfo.st_size
print('Compressed pickle size:', pickle_size_bytes)

Compressed pickle size: 1180298256


In [30]:
# Flatten every image into a single row: (samples, height * width).
# NOTE(review): `tmp` is not referenced by any later cell shown here.
tmp = np.reshape(X_train, (X_train.shape[0], X_train.shape[1] * X_train.shape[2]))

In [2]:
# Reload the splits saved earlier. pickle.load can execute arbitrary code, so
# only use it on files this notebook wrote itself. The `with` block closes the
# handle, which the bare open() call never did.
with open("notMNIST.pickle", "rb") as f:
    data = pickle.load(f)

In [4]:
# Restore the training and test splits from the loaded dict; the validation
# split stored in the pickle is not unpacked here.
X_train, Y_train = data['train_dataset'], data['train_labels']
test_dataset, test_labels = data['test_dataset'], data['test_labels']

In [None]:
# %load logistic_classify.py
def logistic_classify(X, Y, C = 1):
    """Fit a logistic-regression classifier and return it.

    Args:
        X: training samples, one flattened image per row.
        Y: class label for each row of X.
        C: regularization parameter forwarded to LogisticRegression. The
            default of 1 matches the value that used to be hard-coded, so
            existing two-argument calls behave as before.

    Returns:
        The fitted LogisticRegression instance.
    """
    clf = LogisticRegression(C = C)
    clf.fit(X, Y)

    return clf

# Flatten each image into one feature row (row-major order), then fit the
# classifier on the whole training split.
train_flat_shape = (X_train.shape[0], X_train.shape[1] * X_train.shape[2])
X_train_flat = np.reshape(X_train, train_flat_shape, order = 'C')
clf = logistic_classify(X_train_flat, Y_train)

# Accuracy on the same data the model was fitted on.
train_accuracy = clf.score(X_train_flat, Y_train)
print("Training data accuracy is: %f" % train_accuracy)

In [8]:
# Flatten the test images the same way as the training images (row-major).
test_flat_shape = (test_dataset.shape[0], test_dataset.shape[1] * test_dataset.shape[2])
test_dataset_flat = np.reshape(test_dataset, test_flat_shape, order = 'C')

In [6]:
# Accuracy of the full-data model on the training split it was fitted on.
clf.score(X_train_flat, Y_train)

0.78534422985532759

In [9]:
# Accuracy of the same model on the held-out test set.
# NOTE(review): the recorded test score is higher than the training score,
# which is unusual and worth investigating.
clf.score(test_dataset_flat, test_labels)

0.82579234972677595

In [13]:
# Refit on growing prefixes of the training data and compare train/test
# accuracy at each size.
# NOTE(review): each iteration rebinds the global `clf`, so after this cell
# `clf` is the 5000-sample model, not the one fitted on the full training set.
for subset_size in (50, 100, 1000, 5000):
    print("Training dataset size is %d" % subset_size)
    X_subset = X_train_flat[:subset_size]
    Y_subset = Y_train[:subset_size]
    clf = logistic_classify(X_subset, Y_subset)
    train_score = clf.score(X_subset, Y_subset)
    test_score = clf.score(test_dataset_flat, test_labels)
    print("Training Score is %f, Testing Score is %f" % (train_score, test_score))

Training dataset size is 50
Training Score is 1.000000, Testing Score is 0.494863
Training dataset size is 100
Training Score is 1.000000, Testing Score is 0.633661
Training dataset size is 1000
Training Score is 1.000000, Testing Score is 0.745355
Training dataset size is 5000
Training Score is 0.920800, Testing Score is 0.765574
