In [1]:
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import tarfile
from IPython.display import display, Image
from scipy import ndimage
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from scipy.misc import imresize
import string
import h5py
import matplotlib
from collections import Counter

%matplotlib inline

# Pixel value range; load_image uses it to centre and scale the resized image.
pixel_depth = 255.0

In [2]:
# Base URL that maybe_download prepends to each archive name.
url = 'http://ufldl.stanford.edu/housenumbers/'
# Last percentage written by download_progress_hook; None until the first report.
last_percent_reported = None

In [3]:
def download_progress_hook(count, blockSize, totalSize):
    """Report download progress on stdout; used as ``reporthook`` for ``urlretrieve``.

    Writes the percentage whenever it is a multiple of 5 and a single dot for
    every other change in percentage. Nothing is written when the percentage
    is the same as on the previous call.

    Args:
        count: Number of blocks transferred so far.
        blockSize: Size of one block in bytes.
        totalSize: Total size of the download in bytes. A non-positive value
            (size unknown or empty) disables reporting.
    """
    global last_percent_reported

    # A zero total would raise ZeroDivisionError and a negative one (unknown
    # size) would print meaningless negative percentages, so report nothing.
    if totalSize <= 0:
        return

    percent = int(count * blockSize * 100 / totalSize)
    if last_percent_reported == percent:
        return

    sys.stdout.write("%s%%" % percent if percent % 5 == 0 else ".")
    sys.stdout.flush()
    last_percent_reported = percent

In [4]:
def maybe_download(filename, expected_bytes, force=False):
    """Download ``filename`` from ``url`` if needed and verify its size.

    Args:
        filename: Name of the file, both relative to ``url`` and on local disk.
        expected_bytes: Size in bytes the local file must have.
        force: When true, download even if the file already exists locally.

    Returns:
        The local file name once its size has been verified.

    Raises:
        Exception: If the size of the local file differs from ``expected_bytes``.
    """
    if force or not os.path.exists(filename):
        print('Attempting to download:', filename)
        filename, _ = urlretrieve(url + filename, filename, reporthook=download_progress_hook)
        print('\nDownload Complete!')

    statinfo = os.stat(filename)
    if statinfo.st_size != expected_bytes:
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')

    print('Found and verified', filename)
    # Previously this return sat after the raise and was never reached, so the
    # function always returned None.
    return filename

In [5]:
# Archive names are bound once and reused by the download calls below.
train_filename = 'train.tar.gz'
test_filename = 'test.tar.gz'

# Fetch each archive if it is missing and check it against its expected size.
maybe_download(train_filename, 404141560)
maybe_download(test_filename, 276555967)

Found and verified train.tar.gz
Found and verified test.tar.gz


In [6]:
# Presumably the number of digit classes; not referenced in the cells shown here.
num_classes = 10
# Seed NumPy's global random generator.
np.random.seed(42)

In [7]:
def maybe_extract(filename, force=False):
    """Extract a ``.tar.gz`` archive into the working directory unless already done.

    The extraction root is ``filename`` with its last two extensions removed.
    Extraction is skipped when that directory already exists and ``force`` is
    false.

    Args:
        filename: Path of the archive to extract.
        force: When true, extract even if the root directory already exists.

    Returns:
        Sorted list of paths of the sub-directories found directly under the
        extraction root.
    """
    root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
    if os.path.isdir(root) and not force:
        # You may override by setting force=True
        print('%s already present - Skipping extraction of %s.' % (root, filename))
    else:
        print('Extracting data for %s. This may take a while. Please wait.' % root)
        sys.stdout.flush()
        # The context manager closes the archive even if extraction fails.
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only use it on archives from a trusted source.
        with tarfile.open(filename) as tar:
            tar.extractall()

    data_folders = []
    for entry in sorted(os.listdir(root)):
        candidate = os.path.join(root, entry)
        if os.path.isdir(candidate):
            data_folders.append(candidate)
    return data_folders

In [8]:
# Unpack both archives (skipped when already extracted) and keep the lists of
# sub-directories found under each extraction root.
train_folders = maybe_extract(train_filename)
test_folders = maybe_extract(test_filename)

train already present - Skipping extraction of train.tar.gz.
test already present - Skipping extraction of test.tar.gz.


In [9]:
def get_attr (f, i, name):
    """Read one bounding-box attribute for entry ``i`` of a digitStruct file.

    Args:
        f: Opened file object, indexable by name and by the references it stores.
        i: Index of the entry in ``f['digitStruct']['bbox']``.
        name: Name of the attribute to read from the entry's bbox group.

    Returns:
        A 1-D array when the attribute is stored directly as ``float64``.
        Otherwise each stored element is looked up in ``f`` (presumably HDF5
        object references - confirm against the file layout) and the squeezed
        array of the values read that way is returned.
    """
    bbox_ref = f['digitStruct']['bbox'][i][0]
    # ``[()]`` reads the whole dataset; it replaces the ``.value`` property,
    # which h5py deprecated and later removed.
    attr = f[bbox_ref][name][()].squeeze()

    if attr.dtype == 'float64':
        return attr.reshape(-1)

    return np.array([f[x][()] for x in attr]).squeeze()

In [10]:
def get_label (f, i):
    """Return the name stored for entry ``i`` of a digitStruct file.

    The dataset referenced by ``f['digitStruct']['name'][i][0]`` is read,
    serialised to raw bytes, and every NUL byte is removed.

    Args:
        f: Opened file object, indexable by name and by the references it stores.
        i: Index of the entry in ``f['digitStruct']['name']``.

    Returns:
        The name as a byte string (``str`` on Python 2, ``bytes`` on Python 3).
    """
    name_ref = f['digitStruct']['name'][i][0]
    # ``[()]`` replaces the removed ``.value`` property and ``tobytes()`` the
    # deprecated ``tostring()``; both return the same data as before.
    raw = f[name_ref][()].tobytes()
    # Bytes literals keep this working on Python 3, where bytes.replace()
    # rejects str arguments.
    return raw.replace(b'\x00', b'')

In [11]:
def load_data (path):
    """Load names and bounding-box attributes from a digitStruct file.

    Args:
        path: Path of the HDF5-readable ``digitStruct.mat`` file.

    Returns:
        A tuple ``(labels, images, tops, heights, widths, lefts)``. ``images``
        is a 1-D ``'|S15'`` array of the names returned by ``get_label``. The
        other five are float arrays with six columns per entry; unused
        columns hold 10 in ``labels`` and 0 in the remaining arrays.
    """
    # Open read-only and close the file deterministically; the arrays returned
    # below are in-memory copies and stay valid after the file is closed.
    with h5py.File(path, 'r') as f:
        name_count = f['digitStruct']['name'].shape[0]
        bbox_count = len(f['digitStruct']['bbox'])

        images = np.ndarray(shape = (name_count,), dtype = '|S15')
        labels = np.full((bbox_count, 6), 10, dtype = 'float')
        tops = np.zeros((bbox_count, 6), dtype = 'float')
        heights = np.zeros((bbox_count, 6), dtype = 'float')
        widths = np.zeros((bbox_count, 6), dtype = 'float')
        lefts = np.zeros((bbox_count, 6), dtype = 'float')

        # Attribute name in the file -> array that receives its values.
        targets = (
            ('label', labels),
            ('top', tops),
            ('height', heights),
            ('width', widths),
            ('left', lefts),
        )

        # range() instead of the Python 2-only xrange().
        for i in range(name_count):
            images[i] = get_label(f, i)

            for attr_name, target in targets:
                values = get_attr(f, i, attr_name)
                # Fill the leading columns; the rest keep their padding value.
                target[i, :values.shape[0]] = values

            if (i % 5000 == 0):
                print (i, 'passed')

    return labels, images, tops, heights, widths, lefts

In [12]:
# Each tuple is (labels, images, tops, heights, widths, lefts) as returned by load_data.
train_tuple = load_data('train/digitStruct.mat')
test_tuple = load_data('test/digitStruct.mat')

0 passed
5000 passed
10000 passed
15000 passed
20000 passed
25000 passed
30000 passed
0 passed
5000 passed
10000 passed


In [13]:
def maybe_pickle (struct, force=False):
    """Pickle the loaded train/test tuples to ``struct + '.pickle'`` if needed.

    The file is written from the notebook-level ``train_tuple`` and
    ``test_tuple``. Writing is skipped when the file already exists and
    ``force`` is false. A failure while writing is printed, not raised.

    Args:
        struct: Base name of the pickle file, without the extension.
        force: When true, rewrite the file even if it already exists.

    Returns:
        The pickle file name, ``struct + '.pickle'``.
    """
    pickle_path = struct + '.pickle'

    if not force and os.path.exists(pickle_path):
        print('file already present, skipping')
        return pickle_path

    print('pickling file')

    # Position of each field inside the tuples.
    field_names = ('labels', 'images', 'tops', 'heights', 'widths', 'lefts')
    dataset = {
        split: {field: arrays[pos] for pos, field in enumerate(field_names)}
        for split, arrays in (('train', train_tuple), ('test', test_tuple))
    }

    try:
        with open (pickle_path, 'wb') as f:
            pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
    except Exception as e:
        print ('unable to save data', e)

    return pickle_path

In [14]:
# Write svhn.pickle unless it already exists; the returned file name is the cell's output.
maybe_pickle('svhn')

file already present, skipping


'svhn.pickle'

In [15]:
# Reload the pickled dataset into `data`; a failure is reported and then re-raised.
# NOTE: pickle.load can execute arbitrary code - only load files you produced yourself.
try:
    with open ('svhn.pickle', 'rb') as f:
        data = pickle.load(f)
except Exception as e:
    print ('unable to process data', e)
    raise

In [16]:
def load_image (image_file, path='train/', **box):
    """Load an image, crop it to a bounding box and rescale it to 32x32.

    The image is averaged over its third axis, cropped, resized to 32x32 and
    then shifted and scaled with ``pixel_depth``.

    Args:
        image_file: File name of the image, relative to ``path``.
        path: Directory prefix that is concatenated with ``image_file``.
        **box: Crop bounds: ``minTop`` and ``minLeft`` (clamped to 0) and the
            exclusive end bounds ``maxTopHeight`` and ``maxLeftWidth``.

    Returns:
        The processed image as a 32x32 float array.
    """
    # Names read back from a byte-string array are bytes on Python 3 and
    # cannot be concatenated to a str path. On Python 2 bytes is str, so this
    # branch is skipped.
    if isinstance(image_file, bytes) and not isinstance(image_file, str):
        image_file = image_file.decode('ascii')

    # NOTE(review): ndimage.imread and imresize no longer exist in recent
    # SciPy releases; this cell needs an older SciPy or a replacement reader.
    image_data = np.average(ndimage.imread(path+image_file), axis=2)

    # Slice bounds must be integers: the bounds arrive as floats and current
    # NumPy raises TypeError on float indices. int() truncates toward zero.
    top = max(int(box['minTop']), 0)
    left = max(int(box['minLeft']), 0)
    bottom = int(box['maxTopHeight'])
    right = int(box['maxLeftWidth'])

    image_data = image_data[top:bottom, left:right]
    image_data = imresize(image_data, (32,32))
    image_data = (image_data.astype(float) - pixel_depth / 2) / pixel_depth

    return image_data

In [17]:
def load_images (data, struct):
    """Crop every image of one split to its digit bounding box and stack them.

    Args:
        data: Mapping whose ``struct`` entry holds the ``'images'``,
            ``'labels'``, ``'tops'``, ``'heights'``, ``'widths'`` and
            ``'lefts'`` arrays.
        struct: Split name; also used as the image directory (``struct + '/'``).

    Returns:
        A ``float32`` array of shape ``(number of images, 32, 32)``, or
        ``None`` as soon as one image fails to load (diagnostics for that
        image are printed first).
    """
    subset = data[struct]
    images = subset['images']
    labels = subset['labels']
    tops = subset['tops']
    widths = subset['widths']
    heights = subset['heights']
    lefts = subset['lefts']
    path = struct + '/'

    final_data = np.ndarray (shape = (images.shape[0], 32, 32), dtype=np.float32)

    for i in range (final_data.shape[0]):
        if (i % 5000 == 0):
            print (i, 'passed out of', final_data.shape[0], 'for:', struct)

        # NOTE(review): labels padded with 10 never fail the `> -1` test, so
        # charCount then equals the full row length and the zero padding of
        # tops/lefts takes part in the min() below - confirm this is intended.
        charCount = labels[i][labels[i] > - 1].shape[0]
        topHeights = np.array([tops[i][:charCount], heights[i][:charCount]])
        leftWidths = np.array([lefts[i][:charCount], widths[i][:charCount]])
        box = {
            "minTop": min(topHeights[0,:]),
            "minLeft": min(leftWidths[0,:]),
            "maxTopHeight": topHeights.sum(axis=0).max(),
            "maxLeftWidth": leftWidths.sum(axis=0).max()
        }

        try:
            final_data[i,:,:] = load_image (images[i], path, **box)
        except Exception as e:
            # Re-reading the image is only for the diagnostic line; if that
            # fails too, report None so the original error is not masked.
            try:
                img_shape = np.average(ndimage.imread(path+images[i]), axis=2).shape
            except Exception:
                img_shape = None

            diagnostics = dict(box)
            diagnostics["lefts"] = lefts[i]
            diagnostics["widths"] = widths[i]
            # str(e) works on Python 2 and 3; e.message exists only on Python 2.
            diagnostics["message"] = str(e)
            print (i, charCount, img_shape, diagnostics)
            return None

    return final_data

In [18]:
# Cropped 32x32 images per split (load_images returns None if an image failed to load).
X_train = load_images(data, 'train')
X_test = load_images(data, 'test')
# Label arrays are taken unchanged from the pickled dataset.
y_train = data['train']['labels']
y_test = data['test']['labels']

0 passed out of 33402 for: train




5000 passed out of 33402 for: train
10000 passed out of 33402 for: train
15000 passed out of 33402 for: train
20000 passed out of 33402 for: train
25000 passed out of 33402 for: train
30000 passed out of 33402 for: train
0 passed out of 13068 for: test
5000 passed out of 13068 for: test
10000 passed out of 13068 for: test


In [19]:
# Persist the image arrays together with their labels for later use.
try:
    with open ('tensorflow_data.pickle', 'wb') as f:
        pickle.dump({
                'train': {'data': X_train, 'label': y_train},
                'test': {'data': X_test, 'label': y_test}
            }, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
    # `struct` is not defined at notebook level; referencing it here raised a
    # NameError that hid the real failure.
    print ('unable to save data', e)