In [None]:
# Mount Google Drive into this Colab runtime at /content/drive; the dataset
# archive is later opened from '/content/drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import tarfile
from scipy import ndimage
import cv2
from IPython.display import display, Image
from scipy import misc
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle

In [None]:
def maybe_extract(filename, force=False, num_classes=10):
  """Extract a .tar.gz archive from Google Drive and list its class folders.

  Args:
    filename: Archive name (e.g. 'notMNIST_small.tar.gz'). It is opened from
      '/content/drive/My Drive/' and extracted into the current directory.
    force: Extract even if the target directory already exists.
    num_classes: Number of sub-directories (one per class) the extracted
      root must contain. Defaults to 10, the value previously hard-coded.

  Returns:
    Sorted list of paths, one per class sub-directory of the extracted root.

  Raises:
    Exception: if the number of class folders differs from num_classes.
  """
  root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
  if os.path.isdir(root) and not force:
    # You may override by setting force=True.
    print('%s already present - Skipping extraction of %s.' % (root, filename))
  else:
    print('Extracting data for %s. This may take a while. Please wait.' % root)
    # Flush first so the message is visible before the slow extraction starts.
    sys.stdout.flush()
    # The context manager closes the archive even if extraction fails.
    # NOTE(review): extractall() trusts member paths; only use it on archives
    # you control.
    with tarfile.open('/content/drive/My Drive/' + filename) as tar:
      tar.extractall()
  data_folders = [
    os.path.join(root, d) for d in sorted(os.listdir(root))
    if os.path.isdir(os.path.join(root, d))]
  if len(data_folders) != num_classes:
    # num_classes used to be an undefined name here, so this path raised
    # NameError instead of the intended message.
    raise Exception(
      'Expected %d folders, one per class. Found %d instead.' % (
        num_classes, len(data_folders)))
  print(data_folders)
  return data_folders

# Archive to unpack; maybe_extract opens it from '/content/drive/My Drive/'.
test_filename = 'notMNIST_small.tar.gz'
# Paths of the per-class folders found under the extracted root.
data_folders = maybe_extract(test_filename)

notMNIST_small already present - Skipping extraction of notMNIST_small.tar.gz.
['notMNIST_small/A', 'notMNIST_small/B', 'notMNIST_small/C', 'notMNIST_small/D', 'notMNIST_small/E', 'notMNIST_small/F', 'notMNIST_small/G', 'notMNIST_small/H', 'notMNIST_small/I', 'notMNIST_small/J']


In [None]:
# image_size = 28  # Pixel width and height.
# pixel_depth = 255.0  # Number of levels per pixel.

# def load_letter(folder, start, min_num_images):
#   """Load the data for a single letter label."""
#   image_files = os.listdir(folder)
#   dataset = np.ndarray(shape=(len(image_files), image_size, image_size),
#                          dtype=np.float32)
#   image_index = 0
#   print(folder)
#   for image in os.listdir(folder):
#     image_file = os.path.join(folder, image)
#     try:
#       # image_data = (ndimage.imread(image_file).astype(float) - 
#       #               pixel_depth / 2) / pixel_depth
#       image_read = cv2.imread(image_file)
#       if image_read is not None:
#         image_data = (image_read.astype(float) - 
#                       pixel_depth / 2) / pixel_depth
#       if image_data.shape != (image_size, image_size, 3):
#         raise Exception('Unexpected image shape: %s' % str(image_data.shape))
#       dataset[image_index, :, :] = image_data[:, :, 0]
#       image_index += 1
#     except IOError as e:
#       print('Could not read file - it\'s ok, skipping.')
#   dataset = dataset[start:min_num_images, :, :]
#   return dataset
        
# def maybe_pickle(data_folders,start, min_num_images_per_class, force=False):
#   dataset_names = []
#   for folder in data_folders:
#     set_filename = folder + str(start) + '.pickle'
#     dataset_names.append(set_filename)
#     if os.path.exists(set_filename) and not force:
#       # You may override by setting force=True.
#       print('%s already present - Skipping pickling.' % set_filename)
#     else:
#       print('Pickling %s.' % set_filename)
#       dataset = load_letter(folder,start, min_num_images_per_class)
#       try:
#         with open(set_filename, 'wb') as f:
#           pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
#       except Exception as e:
#         print('Unable to save data to', set_filename, ':', e)
  
#   return dataset_names

# train_datasets = maybe_pickle(data_folders,0, 1500)
# validation_datasets = maybe_pickle(data_folders,1500, 1600)
# test_datasets = maybe_pickle(data_folders,1600, 1874)
image_size = 28  # Pixel width and height.
pixel_depth = 255.0  # Number of levels per pixel.

def load_letter(folder, min_num_images):
  """Load the data for a single letter label.

  Every file in `folder` is read with cv2.imread, rescaled with
  (value - pixel_depth / 2) / pixel_depth, and its first channel is stored
  as one (image_size, image_size) float32 row of the result.

  Args:
    folder: Directory holding the image files of one class.
    min_num_images: Minimum number of readable images required.

  Returns:
    float32 array of shape (num_readable_images, image_size, image_size).

  Raises:
    Exception: if an image does not have shape (image_size, image_size, 3),
      or if fewer than min_num_images images could be read.
  """
  image_files = os.listdir(folder)
  dataset = np.ndarray(shape=(len(image_files), image_size, image_size),
                       dtype=np.float32)
  print(folder)
  num_images = 0
  for image in image_files:
    image_file = os.path.join(folder, image)
    image_read = cv2.imread(image_file)
    if image_read is None:
      # cv2.imread reports an unreadable file by returning None rather than
      # raising IOError. Skip it explicitly; otherwise the previous image's
      # data would be stored again (or image_data would be undefined).
      print('Could not read file - it\'s ok, skipping.')
      continue
    image_data = (image_read.astype(float) -
                  pixel_depth / 2) / pixel_depth
    if image_data.shape != (image_size, image_size, 3):
      raise Exception('Unexpected image shape: %s' % str(image_data.shape))
    # Only the first channel is kept.
    dataset[num_images, :, :] = image_data[:, :, 0]
    num_images += 1

  # Drop the rows reserved for files that could not be read.
  dataset = dataset[0:num_images, :, :]
  if num_images < min_num_images:
    raise Exception('Many fewer images than expected: %d < %d' %
                    (num_images, min_num_images))

  print('Full dataset tensor:', dataset.shape)
  print('Mean:', np.mean(dataset))
  print('Standard deviation:', np.std(dataset))
  return dataset
        
def maybe_pickle(data_folders, min_num_images_per_class, force=False):
  """Pickle each class folder to '<folder>.pickle' unless that file exists.

  Args:
    data_folders: Iterable of class folder paths.
    min_num_images_per_class: Passed to load_letter as its minimum image count.
    force: Rebuild the pickle even when the file is already present.

  Returns:
    List of the pickle file names, in the same order as data_folders.
  """
  pickle_paths = []
  for class_dir in data_folders:
    target = class_dir + '.pickle'
    pickle_paths.append(target)

    needs_build = not os.path.exists(target) or force
    if not needs_build:
      # You may override by setting force=True.
      print('%s already present - Skipping pickling.' % target)
      continue

    print('Pickling %s.' % target)
    letter_data = load_letter(class_dir, min_num_images_per_class)
    try:
      with open(target, 'wb') as out_file:
        pickle.dump(letter_data, out_file, pickle.HIGHEST_PROTOCOL)
    except Exception as err:
      # Best effort: report the failure and carry on with the next folder.
      print('Unable to save data to', target, ':', err)

  return pickle_paths

# NOTE(review): maybe_pickle names every file '<folder>.pickle' regardless of
# the size argument, so these three calls return the same list of files. The
# second argument only sets the minimum-image check used when a pickle has to
# be (re)built.
train_datasets = maybe_pickle(data_folders, 1500)
validation_datasets = maybe_pickle(data_folders, 100)
test_datasets = maybe_pickle(data_folders, 272)

notMNIST_small/A.pickle already present - Skipping pickling.
notMNIST_small/B.pickle already present - Skipping pickling.
notMNIST_small/C.pickle already present - Skipping pickling.
notMNIST_small/D.pickle already present - Skipping pickling.
notMNIST_small/E.pickle already present - Skipping pickling.
notMNIST_small/F.pickle already present - Skipping pickling.
notMNIST_small/G.pickle already present - Skipping pickling.
notMNIST_small/H.pickle already present - Skipping pickling.
notMNIST_small/I.pickle already present - Skipping pickling.
notMNIST_small/J.pickle already present - Skipping pickling.
notMNIST_small/A.pickle already present - Skipping pickling.
notMNIST_small/B.pickle already present - Skipping pickling.
notMNIST_small/C.pickle already present - Skipping pickling.
notMNIST_small/D.pickle already present - Skipping pickling.
notMNIST_small/E.pickle already present - Skipping pickling.
notMNIST_small/F.pickle already present - Skipping pickling.
notMNIST_small/G.pickle 

In [None]:
# def make_arrays(nb_rows, img_size):
#   if nb_rows:
#     dataset = np.ndarray((nb_rows, img_size, img_size), dtype=np.float32)
#     labels = np.ndarray(nb_rows, dtype=np.int32)
#   else:
#     dataset, labels = None, None
#   return dataset, labels

# def merge_datasets(pickle_files, train_size):
#   num_classes = len(pickle_files)
#   train_dataset, train_labels = make_arrays(train_size, image_size)
#   tsize_per_class = train_size // num_classes
    
#   start_t = 0
#   for label, pickle_file in enumerate(pickle_files):       
#     try:
#       with open(pickle_file, 'rb') as f:
#         np.random.shuffle(letter_set)
#         letter_set = pickle.load(f)
#         train_letter = letter_set[vsize_per_class:end_l, :, :]
#         train_dataset[start_t:end_t, :, :] = train_letter
#         train_labels[start_t:end_t] = label
#         start_t += tsize_per_class
#         end_t += tsize_per_class
#     except Exception as e:
#       print('Unable to process data from', pickle_file, ':', e)
#       raise
    
#   return train_dataset, train_labels
            
            
# train_size = 15000
# valid_size = 1000
# test_size = 2726

# train_dataset, train_labels = merge_datasets(
#   train_datasets, train_size)
# test_dataset, test_labels = merge_datasets(
#   test_datasets, test_size)
# valid_dataset, valid_labels = merge_datasets(
#   validation_datasets, valid_size)
# print('Training:', train_dataset.shape, train_labels.shape)
# print('Validation:', valid_dataset.shape, valid_labels.shape)
# print('Testing:', test_dataset.shape, test_labels.shape)
# print(valid_dataset)
#############################################################################################
def load_letter_based_on_type(folder, dataset_type, img_size=28, depth=255.0):
  """Load one split of a single letter folder as a float32 image array.

  The folder listing is sorted and sliced by position: entries [0, 1500) form
  the 'train' split, [1500, 1600) the 'validate' split and [1600, end) the
  'test' split. Each image is rescaled with (value - depth / 2) / depth and
  only its first channel is kept.

  Args:
    folder: Directory holding the image files of one class.
    dataset_type: 'train', 'validate' or 'test'.
    img_size: Width and height of each image, in pixels.
    depth: Number of levels per pixel, used for rescaling.

  Returns:
    float32 array of shape (num_loaded, img_size, img_size). Files that
    cv2.imread cannot decode are skipped, so num_loaded may be smaller than
    the size of the requested slice.

  Raises:
    ValueError: if dataset_type is not one of the three known splits.
  """
  if dataset_type == 'train':
    start_index, end_index = 0, 1500
  elif dataset_type == 'validate':
    start_index, end_index = 1500, 1600
  elif dataset_type == 'test':
    start_index, end_index = 1600, None  # everything after the other splits
  else:
    # Previously an unknown type surfaced as an UnboundLocalError further down.
    raise ValueError(
      "dataset_type must be 'train', 'validate' or 'test', got %r"
      % (dataset_type,))

  # os.listdir returns entries in arbitrary order. Listing once and sorting
  # gives every call the same ordering, so the three splits cannot overlap.
  selected_files = sorted(os.listdir(folder))[start_index:end_index]

  # Size the array from the files actually selected, so folders smaller than
  # a split boundary yield an empty/short array instead of a bad dimension.
  dataset = np.zeros(shape=(len(selected_files), img_size, img_size),
                     dtype=np.float32)
  num_loaded = 0
  for image in selected_files:
    image_file = os.path.join(folder, image)
    image_read = cv2.imread(image_file)
    if image_read is None:
      # cv2.imread returns None for unreadable files; skip them rather than
      # leaving an unwritten row in the result.
      print('Could not read', image_file, '- skipping.')
      continue
    image_data = (image_read.astype(float) - depth / 2) / depth
    dataset[num_loaded, :, :] = image_data[:, :, 0]
    num_loaded += 1

  return dataset[:num_loaded]

def separate_folder(data_folders):
  """Split every class folder into train/test/validation pickles.

  For each folder the three splits are loaded with load_letter_based_on_type
  and written to '<folder>.training', '<folder>.testing' and
  '<folder>.validation'. A failed write is reported and does not stop the run.

  Args:
    data_folders: Iterable of class folder paths.

  Returns:
    Dict with keys 'validate_name', 'training_name' and 'testing_name', each
    mapping to the list of output file names in data_folders order.
  """
  names = {'validate_name': [],
           'training_name': [],
           'testing_name': []}

  for class_dir in data_folders:
    train_path = class_dir + '.training'
    test_path = class_dir + '.testing'
    valid_path = class_dir + '.validation'

    names['training_name'].append(train_path)
    names['testing_name'].append(test_path)
    names['validate_name'].append(valid_path)

    # Load all three splits before anything is written.
    train_split = load_letter_based_on_type(class_dir, 'train')
    test_split = load_letter_based_on_type(class_dir, 'test')
    valid_split = load_letter_based_on_type(class_dir, 'validate')

    # Written in the order validation, testing, training.
    outputs = ((valid_path, valid_split),
               (test_path, test_split),
               (train_path, train_split))
    for out_path, split in outputs:
      try:
        with open(out_path, 'wb') as out_file:
          pickle.dump(split, out_file, pickle.HIGHEST_PROTOCOL)
      except Exception as err:
        print('Unable to save data to', out_path, ':', err)

  return names

# Build and pickle the per-letter train/validation/test splits.
folder_names_dictionary = separate_folder(data_folders)
# `valid_dataset` is only created by the merge cell further down, so printing
# it here raised NameError on a fresh kernel (Restart & Run All). Report what
# this cell actually produced instead.
for split_key, split_files in folder_names_dictionary.items():
  print(split_key, ':', len(split_files), 'files')
######################################################################
# def make_arrays(nb_rows, img_size):
#   if nb_rows:
#     dataset = np.ndarray((nb_rows, img_size, img_size), dtype=np.float32)
#     labels = np.ndarray(nb_rows, dtype=np.int32)
#   else:
#     dataset, labels = None, None
#   return dataset, labels

# def merge_datasets(pickle_files, train_size, valid_size=0):
#   num_classes = len(pickle_files)
#   valid_dataset, valid_labels = make_arrays(valid_size, image_size)
#   train_dataset, train_labels = make_arrays(train_size, image_size)
#   vsize_per_class = valid_size // num_classes
#   tsize_per_class = train_size // num_classes
    
#   start_v, start_t = 0, 0
#   end_v, end_t = vsize_per_class, tsize_per_class
#   end_l = vsize_per_class+tsize_per_class
#   for label, pickle_file in enumerate(pickle_files):       
#     try:
#       with open(pickle_file, 'rb') as f:
#         letter_set = pickle.load(f)
#         # let's shuffle the letters to have random validation and training set
#         np.random.shuffle(letter_set)
#         if valid_dataset is not None:
#           valid_letter = letter_set[:vsize_per_class, :, :]
#           valid_dataset[start_v:end_v, :, :] = valid_letter
#           valid_labels[start_v:end_v] = label
#           start_v += vsize_per_class
#           end_v += vsize_per_class
                    
#         train_letter = letter_set[vsize_per_class:end_l, :, :]
#         train_dataset[start_t:end_t, :, :] = train_letter
#         train_labels[start_t:end_t] = label
#         start_t += tsize_per_class
#         end_t += tsize_per_class
#     except Exception as e:
#       print('Unable to process data from', pickle_file, ':', e)
#       raise
#   return valid_dataset, valid_labels, train_dataset, train_labels
            
            
# train_size = 15000
# valid_size = 1000
# test_size = 2720

# valid_dataset,valid_labels, train_dataset, train_labels = merge_datasets(
#   train_datasets, train_size, valid_size)
# _, _, valid_dataset, valid_labels = merge_datasets(validation_datasets, valid_size)
# _, _, test_dataset, test_labels = merge_datasets(test_datasets, test_size)

# print('Training:', train_dataset.shape, train_labels.shape)
# print('Validation:', valid_dataset.shape, valid_labels.shape)
# print('Testing:', test_dataset.shape, test_labels.shape)

[[-0.5        -0.5        -0.5        ...  0.5         0.5
   0.5       ]
 [-0.5        -0.5        -0.5        ...  0.11960784 -0.24901961
  -0.5       ]
 [-0.5        -0.5        -0.5        ...  0.49607843  0.5
   0.38627452]
 ...
 [-0.5        -0.5        -0.5        ... -0.5        -0.5
  -0.5       ]
 [-0.5        -0.5        -0.5        ... -0.5        -0.5
  -0.5       ]
 [-0.5        -0.5        -0.5        ... -0.5        -0.5
  -0.5       ]]


In [None]:
def make_arrays(nb_rows, img_size):
  """Allocate uninitialised image and label arrays for nb_rows samples.

  Args:
    nb_rows: Number of samples; a falsy value (0, None) means "no arrays".
    img_size: Width and height of each image.

  Returns:
    (dataset, labels): a float32 array of shape (nb_rows, img_size, img_size)
    and an int32 array of shape (nb_rows,), or (None, None) when nb_rows is
    falsy. The array contents are not initialised.
  """
  if not nb_rows:
    return None, None
  images = np.empty((nb_rows, img_size, img_size), dtype=np.float32)
  targets = np.empty(nb_rows, dtype=np.int32)
  return images, targets

def merge(number, folder_names):
  """Concatenate pickled per-class arrays into one dataset with labels.

  Each file in folder_names is unpickled and copied into a pre-allocated
  array; its rows are labelled with the file's position in folder_names
  (0 for the first file, 1 for the second, ...).

  Args:
    number: Capacity of the merged arrays, i.e. the maximum total row count.
    folder_names: Pickle files, one per class, in label order.

  Returns:
    (dataset, labels) holding only the rows that were actually filled. If the
    files hold fewer than `number` rows the result is trimmed, so no
    uninitialised rows or labels are returned.

  Raises:
    ValueError: if the files hold more rows than `number`.
  """
  dataset, labels = make_arrays(number, 28)
  filled = 0
  for label, folder_name in enumerate(folder_names):
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load pickles this notebook produced.
    with open(folder_name, 'rb') as pickle_file:
      loaded_dataset = pickle.load(pickle_file)
    if len(loaded_dataset) == 0:
      continue
    end_index = filled + len(loaded_dataset)
    if end_index > (number or 0):
      # Fail with a clear message instead of an opaque broadcast error.
      raise ValueError(
        'merge() was sized for %s rows but %s needs at least %d'
        % (number, folder_name, end_index))
    dataset[filled:end_index, :, :] = loaded_dataset
    labels[filled:end_index] = label
    filled = end_index

  if dataset is None:
    return dataset, labels
  if filled < number:
    print('merge(): expected %d rows but only %d were loaded; trimming.'
          % (number, filled))
  return dataset[:filled], labels[:filled]

            
# The first argument sizes the arrays that merge() pre-allocates.
# 1000 and 15000 correspond to the 10 class folders times the 100-image
# 'validate' and 1500-image 'train' slices taken in load_letter_based_on_type.
# 2726 is the total left over for 'test' — presumably specific to this copy
# of the data, TODO confirm.
valid_dataset, valid_labels = merge(1000, folder_names_dictionary['validate_name'])
test_dataset, test_labels = merge(2726, folder_names_dictionary['testing_name'])
train_dataset, train_labels = merge(15000, folder_names_dictionary['training_name'])

# Sanity check: image arrays are (rows, 28, 28), label arrays are (rows,).
print('Training:', train_dataset.shape, train_labels.shape)
print('Validation:', valid_dataset.shape, valid_labels.shape)
print('Testing:', test_dataset.shape, test_labels.shape)
# print(valid_dataset)

Training: (15000, 28, 28) (15000,)
Validation: (1000, 28, 28) (1000,)
Testing: (2726, 28, 28) (2726,)


In [None]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range

In [None]:
image_size = 28
num_labels = 10

def reformat(dataset, labels):
  """Flatten images to vectors and one-hot encode integer labels.

  Args:
    dataset: Array reshapeable to (-1, image_size * image_size).
    labels: 1-D array of integer class ids.

  Returns:
    (dataset, labels): float32 arrays of shape (N, image_size * image_size)
    and (N, num_labels).
  """
  flat_images = dataset.reshape(-1, image_size * image_size).astype(np.float32)
  # Map 0 to [1.0, 0.0, 0.0 ...], 1 to [0.0, 1.0, 0.0 ...]
  class_ids = np.arange(num_labels)
  one_hot = np.equal(class_ids, labels[:, None]).astype(np.float32)
  return flat_images, one_hot
# NOTE(review): these lines rebind the same names they read, so running this
# cell a second time feeds already one-hot labels back into reformat(). Re-run
# the merge cell first (or Restart & Run All) before re-running this one.
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
# Images are now flat vectors and labels are one-hot rows.
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (15000, 784) (15000, 10)
Validation set (1000, 784) (1000, 10)
Test set (2726, 784) (2726, 10)


# Task II

In [None]:
batch_size = 128
hidden_nodes = 1000
graph = tf.Graph()
with graph.as_default():
    v1 = tf.compat.v1
    num_pixels = image_size * image_size

    # Inputs: the training minibatch is fed through placeholders at run time,
    # while the validation and test sets are baked into the graph as constants.
    tf_train_dataset = v1.placeholder(
        tf.float32, shape=(batch_size, num_pixels), name="td")
    tf_train_labels = v1.placeholder(
        tf.float32, shape=(batch_size, num_labels), name="tl")
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Parameters of the hidden layer and of the output layer.
    weights1 = tf.Variable(v1.truncated_normal([num_pixels, hidden_nodes]))
    biases1 = tf.Variable(tf.zeros([hidden_nodes]))
    weights2 = tf.Variable(v1.truncated_normal([hidden_nodes, num_labels]))
    biases2 = tf.Variable(tf.zeros([num_labels]))

    # Training forward pass: affine -> ReLU -> affine.
    logits_1 = tf.matmul(tf_train_dataset, weights1) + biases1
    relu1 = tf.nn.relu(logits_1)
    logits_2 = tf.matmul(relu1, weights2) + biases2

    # Mean softmax cross-entropy over the minibatch.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            logits=logits_2, labels=tf_train_labels))

    # Plain gradient descent with learning rate 0.5.
    optimizer = v1.train.GradientDescentOptimizer(0.5).minimize(loss)

    def _predict(inputs):
        # Same two-layer forward pass as above, followed by softmax.
        hidden = tf.nn.relu(tf.matmul(inputs, weights1) + biases1)
        return tf.nn.softmax(tf.matmul(hidden, weights2) + biases2)

    # Class probabilities for the training, validation and test data.
    train_prediction = tf.nn.softmax(logits_2)
    valid_prediction = _predict(tf_valid_dataset)
    test_prediction = _predict(tf_test_dataset)

In [None]:
num_steps = 3001
def accuracy(predictions, labels):
  """Return the percentage of rows whose arg-max matches in both arrays.

  Args:
    predictions: 2-D array of per-class scores, one row per sample.
    labels: 2-D array of the same shape (e.g. one-hot targets).

  Returns:
    Percentage (0-100) of rows where the highest-scoring column agrees.
  """
  predicted_classes = np.argmax(predictions, axis=1)
  true_classes = np.argmax(labels, axis=1)
  num_correct = np.sum(predicted_classes == true_classes)
  return 100.0 * num_correct / predictions.shape[0]
with tf.compat.v1.Session(graph=graph) as session:
  # global_variables_initializer() replaces the deprecated
  # initialize_all_variables().
  tf.compat.v1.global_variables_initializer().run()
  print("Initialized")
  # merge() concatenates the per-class pickles in label order and nothing
  # shuffles the result, so consecutive rows of train_dataset share a label.
  # Draw minibatches through a fixed random permutation so each batch mixes
  # classes; the seed keeps runs reproducible.
  shuffled_indices = np.random.RandomState(0).permutation(train_labels.shape[0])
  for step in range(num_steps):
    # Pick an offset within the shuffled training indices.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_indices = shuffled_indices[offset:(offset + batch_size)]
    batch_data = train_dataset[batch_indices, :]
    batch_labels = train_labels[batch_indices, :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
    _, l, predictions = session.run(
      [optimizer, loss, train_prediction], feed_dict=feed_dict)
    # Report progress every 500 steps.
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
      print ("============================")
  # Final evaluation on the held-out test set.
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 290.664185
Minibatch accuracy: 7.0%
Validation accuracy: 10.0%
Minibatch loss at step 500: 197.119598
Minibatch accuracy: 63.3%
Validation accuracy: 42.0%
Minibatch loss at step 1000: 144.398346
Minibatch accuracy: 78.9%
Validation accuracy: 52.7%
Minibatch loss at step 1500: 4.747934
Minibatch accuracy: 55.5%
Validation accuracy: 68.5%
Minibatch loss at step 2000: 4.467845
Minibatch accuracy: 78.9%
Validation accuracy: 76.3%
Minibatch loss at step 2500: 0.547467
Minibatch accuracy: 91.4%
Validation accuracy: 79.9%
Minibatch loss at step 3000: 2.143633
Minibatch accuracy: 89.1%
Validation accuracy: 82.3%
Test accuracy: 80.9%
