## Download and extract files

In [2]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import tarfile
from IPython.display import display, Image
from scipy import ndimage
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
import tensorflow as tf
import json

# Configure the matplotlib backend to plot inline in IPython
%matplotlib inline
plt.style.use('ggplot')



Download the original, variable-resolution, color house-number images with character-level bounding boxes, as shown in the Street View House Numbers (SVHN) dataset. The bounding-box information is stored in digitStruct.mat instead of being drawn directly on the images in the dataset.

In [3]:
url = 'http://ufldl.stanford.edu/housenumbers/'
# Last percentage written by download_progress_hook. It lives at module level
# so the value survives between successive callback invocations.
last_percent_reported = None

def download_progress_hook(count, blockSize, totalSize):
  """Report download progress; meant to be passed as a urlretrieve reporthook.

  Writes to stdout once per whole-percent change: the percentage itself at
  every multiple of 5, and a single "." for every other percent.

  Args:
    count: number of blocks transferred so far.
    blockSize: size of one block in bytes.
    totalSize: total size of the file in bytes. A value <= 0 means the size
      is unknown, in which case nothing is reported.
  """
  global last_percent_reported
  if totalSize <= 0:
    # Without a size no percentage can be computed: dividing by 0 would raise
    # ZeroDivisionError and a negative size would yield negative percentages.
    return
  # The final block is usually only partly filled, so count * blockSize can
  # overshoot totalSize; clamp so the report never exceeds 100%.
  percent = min(100, int(count * blockSize * 100 / totalSize))

  if last_percent_reported != percent:
    if percent % 5 == 0:
      sys.stdout.write("%s%%" % percent)
    else:
      sys.stdout.write(".")
    sys.stdout.flush()
    last_percent_reported = percent
        
def maybe_download(filename, expected_bytes, force=False):
  """Fetch `filename` from `url` when needed, then compare its size on disk.

  Args:
    filename: name of the file, used both as the remote name appended to
      `url` and as the local destination path.
    expected_bytes: size in bytes the local file is expected to have.
    force: when True, download even if a local copy already exists.

  Returns:
    The local file name. A size mismatch is only reported on stdout; the
    name is returned either way.
  """
  needs_fetch = force or not os.path.exists(filename)
  if needs_fetch:
    print('Attempting to download:', filename)
    filename, _ = urlretrieve(url + filename, filename, reporthook=download_progress_hook)
    print('\nDownload Complete!')

  actual_bytes = os.stat(filename).st_size
  if actual_bytes != expected_bytes:
    print('Failed to verify ' + filename + '. Please delete and try again')
  else:
    print('Found and verified', filename)
  return filename

In [5]:
# The second argument is the expected archive size in bytes, which
# maybe_download compares against the size of the file on disk.
# NOTE(review): the 0 passed for extra.tar.gz looks like a placeholder; it can
# never match a non-empty file, so this call always prints "Failed to verify".
train_filename = maybe_download('train.tar.gz', 404141560)
test_filename = maybe_download('test.tar.gz', 276555967)
extra_filename = maybe_download('extra.tar.gz', 0)

Found and verified train.tar.gz
Found and verified test.tar.gz
Failed to verify extra.tar.gz. Please delete and try again


In [6]:
# Fetch the 32x32 .mat files. The return values are not stored; only the last
# call's result is shown as the cell output.
# NOTE(review): expected_bytes is 0 for all three, so the size check reports
# "Failed to verify" for any non-empty file — supply the real sizes if
# verification is wanted.
maybe_download('train_32x32.mat', 0)
maybe_download('test_32x32.mat', 0)
maybe_download('extra_32x32.mat', 0)

Failed to verify train_32x32.mat. Please delete and try again
Failed to verify test_32x32.mat. Please delete and try again
Failed to verify extra_32x32.mat. Please delete and try again


'extra_32x32.mat'

In [7]:
# NOTE(review): num_classes is not referenced in the cells visible here —
# presumably the ten digit classes; confirm it is still needed.
num_classes = 10
# Seed NumPy's global RNG; np.random.permutation in maybe_pickle draws from it.
np.random.seed(133)

def maybe_extract(filename, force=False):
  """Extract a .tar.gz archive unless its target directory already exists.

  The target directory `root` is `filename` with its last two extensions
  removed (train.tar.gz -> train).

  Args:
    filename: path to the archive.
    force: when True, extract even if `root` is already a directory.

  Returns:
    The string root + '/digitStruct.mat'. It is returned even when that file
    does not exist; in that case "digitStruct.mat is missing" is printed.
  """
  root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
  if os.path.isdir(root) and not force:
    # You may override by setting force=True.
    print('%s already present - Skipping extraction of %s.' % (root, filename))
  else:
    print('Extracting data for %s. This may take a while. Please wait.' % root)
    sys.stdout.flush()
    # Unpack next to the archive so the extracted directory lands where `root`
    # points, including when `filename` carries a directory component.
    target_dir = os.path.dirname(filename) or '.'
    # NOTE: extractall() trusts the member paths stored in the archive; only
    # use it on archives from a trusted source.
    # The with-block closes the archive even if extraction fails part-way.
    with tarfile.open(filename) as tar:
      tar.extractall(target_dir)
  struct_path = root + '/digitStruct.mat'
  if not os.path.exists(struct_path):
    print("digitStruct.mat is missing")
  return struct_path
  
# Unpack the train and test archives; each variable receives the
# digitStruct.mat path returned by maybe_extract.
train_struct = maybe_extract(train_filename)
test_struct = maybe_extract(test_filename)
# extra_struct = maybe_extract(extra_filename)

train already present - Skipping extraction of train.tar.gz.
test already present - Skipping extraction of test.tar.gz.


In [8]:
# The recorded run of this cell stopped with
# "IOError: [Errno 28] No space left on device": make sure enough disk space
# is free before extracting the extra archive.
extra_struct=maybe_extract(extra_filename)

Extracting data for extra. This may take a while. Please wait.


IOError: [Errno 28] No space left on device

In [None]:
# NOTE(review): none of these constants is referenced in the cells visible
# here — confirm they are still needed.
pixel_depth =255.0 # number of levels per pixel
screen_width=800
screen_height=600

In [None]:
import h5py
# Open the training annotations read-only. The class is h5py.File (capital F)
# and the path must be a string literal.
c = h5py.File('train/digitStruct.mat', 'r')

In [None]:
# dataset[()] reads the whole dataset; it replaces the Dataset.value
# attribute, which h5py 3 removed.
d = c[c['digitStruct']['bbox'][0][0]]['label'][()].squeeze()
#label of first image
np.array([c[x][()] for x in d]).squeeze()

In [None]:
# 'top' values of the first image's boxes. dataset[()] replaces the
# Dataset.value attribute, which h5py 3 removed.
d = c[c['digitStruct']['bbox'][0][0]]['top'][()].squeeze()
np.array([c[x][()] for x in d]).squeeze()

In [None]:
#filename of first image, then .replace('\x00','')
# dataset[()] replaces the removed Dataset.value; tobytes() is the current
# name of the deprecated ndarray.tostring().
c[c['digitStruct']['name'][0][0]][()].tobytes()

In [None]:
import h5py # a Pythonic interface to the HDF5 binary data format

def get_attr(c,i,attr):
    """Return attribute `attr` of every bounding box of image `i` as an array.

    Args:
        c: the opened digitStruct file (indexable by group name and by the
            object references it stores).
        i: zero-based image index into c['digitStruct']['bbox'].
        attr: name of the field to read, e.g. 'label' or 'top'.

    Returns:
        A NumPy array of the values. When the stored entry is already float64
        it is returned flattened to 1-D; otherwise every element is treated as
        a reference and looked up in `c`.
    """
    # dataset[()] reads the full dataset; the older Dataset.value attribute
    # was removed in h5py 3.
    d = c[c['digitStruct']['bbox'][i][0]][attr][()].squeeze()
    if d.dtype == 'float64':
        return d.reshape(-1)
    return np.array([c[x][()] for x in d]).squeeze()

def _get_filename(c, i):
    """Return the file name stored for image `i` as a text string."""
    # NOTE(review): assumes the name is stored as an array of character codes
    # (the usual layout for strings in MATLAB v7.3 files) — confirm.
    codes = c[c['digitStruct']['name'][i][0]][()]
    return ''.join(chr(int(code)) for code in codes.flatten())

def load_data(path, max_digits=6):
    """Read file names and bounding-box attributes from a digitStruct file.

    Args:
        path: path to the digitStruct.mat file, opened with h5py.
        max_digits: number of per-image slots (columns) in every returned
            attribute array.

    Returns:
        A tuple (labels, images, tops, heights, widths, lefts). `images` is a
        1-D array of '|S15' byte strings holding the file names. The other
        five are float arrays with one row per image and `max_digits` columns.
        Unused slots are 10 in `labels` and 0 in the remaining arrays.

    Raises:
        ValueError: if an image has more boxes than `max_digits` (the row
            assignment cannot broadcast).
    """
    # Open read-only and close the file once everything is copied to NumPy.
    with h5py.File(path, 'r') as c:
        n_images = c['digitStruct']['name'].shape[0]
        n_boxes = len(c['digitStruct']['bbox'])
        images = np.ndarray(shape=(n_images, ), dtype='|S15')
        # NOTE(review): 10 marks an empty slot here, but SVHN itself is
        # understood to encode the digit 0 as label 10 — confirm that
        # downstream code can tell the two apart.
        labels = np.full((n_boxes, max_digits), 10, dtype=float)
        tops = np.zeros((n_boxes, max_digits), dtype=float)
        heights = np.zeros((n_boxes, max_digits), dtype=float)
        widths = np.zeros((n_boxes, max_digits), dtype=float)
        lefts = np.zeros((n_boxes, max_digits), dtype=float)
        # range() instead of xrange() so the loop also runs on Python 3.
        for i in range(n_images):
            images[i] = _get_filename(c, i)
            y = get_attr(c, i, 'label')
            t = get_attr(c, i, 'top')
            h = get_attr(c, i, 'height')
            w = get_attr(c, i, 'width')
            l = get_attr(c, i, 'left')
            # Fill only the leading slots; the rest keep their padding value.
            labels[i, :y.shape[0]] = y
            tops[i, :t.shape[0]] = t
            heights[i, :h.shape[0]] = h
            widths[i, :w.shape[0]] = w
            lefts[i, :l.shape[0]] = l
            if (i % 5000 == 0):
                print(i, "elapsed")
    return labels, images, tops, heights, widths, lefts

In [None]:
# Each call returns (labels, images, tops, heights, widths, lefts).
# maybe_pickle reads these three module-level names directly, so all of them
# must be defined before it is called.
train__tuple = load_data('train/digitStruct.mat')
test__tuple = load_data('test/digitStruct.mat')
extra__tuple = load_data('extra/digitStruct.mat')

In [None]:
def maybe_pickle(struct, force=False):
    """Bundle the loaded train/test/extra tuples into '<struct>.pickle'.

    Reads the module-level train__tuple, test__tuple and extra__tuple, each
    ordered as (labels, images, tops, heights, widths, lefts). The pickled
    object is a dict with keys 'train', 'test', 'extra' and 'valid', each
    mapping those six attribute names to arrays. 'valid' holds up to 2000
    randomly chosen rows of the extra set.

    Args:
        struct: output path without the '.pickle' suffix.
        force: when True, rewrite the pickle even if it already exists.

    Returns:
        The pickle path, struct + '.pickle'. It is returned even when saving
        failed; the failure is only printed.
    """
    pickle_path = struct + '.pickle'
    if os.path.exists(pickle_path) and not force:
        # You may override by setting force=True
        print('%s already present - Skipping pickling.' % struct)
    else:
        # Format the complete path; '%' binds tighter than '+', so the
        # original expression printed "<struct>..pickle".
        print('Pickling %s.' % pickle_path)
        permutation = np.random.permutation(extra__tuple[1].shape[0])[:2000]
        attrs = ['labels', 'images', 'tops', 'heights', 'widths', 'lefts']

        d_train = {}
        d_test = {}
        d_extra = {}
        d_valid = {}
        for i, attr in enumerate(attrs):
            d_train[attr] = train__tuple[i]
            d_test[attr] = test__tuple[i]
            d_extra[attr] = extra__tuple[i]
            # The validation rows are copied out of the extra set but not
            # removed from it, so 'valid' and 'extra' overlap.
            d_valid[attr] = extra__tuple[i][permutation]

        dataset = {'train': d_train, 'test': d_test,
                   'extra': d_extra, 'valid': d_valid}
        try:
            with open(pickle_path, 'wb') as f:
                pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
        except Exception as e:
            # Best effort: report and carry on. A partly written file may be
            # left behind and would be treated as "already present" next time.
            print('Unable to save data to', pickle_path, ':', e)
    return pickle_path

In [None]:
# The tuple is bound as extra__tuple (two underscores); extra_tuple is undefined.
extra__tuple[0][[1,2,3,4]] #labels for 2.png to 5.png

In [None]:
# Render extra/5.png inline in the notebook.
display(Image(filename='extra/5.png'))

In [None]:
# Writes svhn.pickle unless it already exists; the returned path is shown as
# the cell output.
maybe_pickle('svhn')

## Data Exploration
- features and calculated statistics relevant to the problem
- a sampling of the data
- abnormalities and characteristics of the data

In [None]:
# Reload the dict written by maybe_pickle('svhn').
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# were produced locally or come from a trusted source.
with open('svhn.pickle','rb') as f:
    dataset=pickle.load(f)

In [None]:
from collections import Counter
# number of digits in each image
def count_number_digits(source):
    """Plot a bar chart of how many images contain each number of digits.

    Args:
        source: key of the split to summarise in the module-level `dataset`
            dict, e.g. 'train', 'test' or 'extra'.
    """
    # A slot counts as a digit when its 'tops' entry is positive.
    # NOTE(review): a real box whose top coordinate is 0 or negative would be
    # missed by this test — confirm, or count on 'heights' instead.
    digits_per_image = np.sum(dataset[source]['tops'] > 0, axis=1)
    c = Counter(digits_per_image)
    # Materialise the dict views: plt.bar needs sequences, not views.
    plt.bar(list(c.keys()), list(c.values()), 1)
    plt.xlabel('Number of characters')
    plt.ylabel('Sample size')
    plt.grid(True)
    # Fixed the misspelt parameter name (soruce), which raised NameError, and
    # added the missing space between the split name and "dataset".
    plt.title(source + ' dataset')
    plt.show()

# Draw the digit-count chart for each split stored in `dataset`.
count_number_digits('train')
count_number_digits('test')
count_number_digits('extra')