Deep Learning
=============

Here we load, explore and, where required, clean the data from the Street View House Numbers (SVHN) dataset.

In [5]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import tarfile
from IPython.display import display, Image
from scipy import ndimage
from scipy import io
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
import pandas as pd

# Configure the matplotlib backend to plot inline in IPython
%matplotlib inline

In [32]:
fileName = 'train/digitStruct.csv'

# Per-digit bounding-box annotations; several rows may share one FileName.
df = pd.read_csv(fileName)
# Helper column: its values are irrelevant, it is only counted per image below.
df['LabelCount'] = df['DigitLabel']
# Remap label 10 to 0 (presumably the dataset encodes the digit zero as 10 —
# confirm). This also keeps 10 free for load_letter, which pads label rows
# with 10 to mark an absent digit.
df['DigitLabel'] = df['DigitLabel'].replace(10, 0)
# right and bottom most offsets
df['RightMost'] = df['Left'] + df['Width']
df['BottomMost'] = df['Top'] + df['Height']

# Collapse to one row per image: the box enclosing every digit, the number of
# digits, and the digit sequence in CSV row order.
# String aliases are used instead of np.min / np.max / np.size because newer
# pandas deprecates passing numpy callables to agg; the results are the same.
grouped = df.groupby(['FileName'])
agg = grouped.agg({
    'Left': 'min',
    'Top': 'min',
    'BottomMost': 'max',
    'RightMost': 'max',
    'LabelCount': 'size',
    'DigitLabel': lambda x: tuple(x),
})
agg['Height'] = agg['BottomMost'] - agg['Top']
agg['Width'] = agg['RightMost'] - agg['Left']
agg.head()

Unnamed: 0_level_0,Top,RightMost,DigitLabel,Left,BottomMost,LabelCount,Height,Width
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1.png,77,419,"(1, 9)",246,300,2,223,173
10.png,4,48,"(1, 6)",25,33,2,29,23
100.png,0,42,"(1, 0, 9)",18,23,3,23,24
1000.png,1,27,"(1, 3)",17,19,2,18,10
10000.png,20,88,"(1, 5, 3)",45,52,3,32,43


In [42]:
image_size = 32  # Pixel width and height.
pixel_depth = 255.0  # Number of levels per pixel.

def load_letter(folder, label_table=None):
  """Load the images in `folder` together with their digit-sequence labels.

  Args:
    folder: directory whose entries are image files named like the index of
      the label table (e.g. '1.png').
    label_table: per-image DataFrame indexed by file name with a `DigitLabel`
      column (sequence of digits) and a `LabelCount` column (number of
      digits). Defaults to the notebook-level `agg` table.

  Returns:
    (dataset, labels) where
      dataset: float32 array of shape (n, image_size, image_size, 3) holding
        (pixel - pixel_depth / 2) / pixel_depth for every loaded image;
      labels: int8 array of shape (n, 6); column 0 is the digit count and
        columns 1-5 are the digits, padded with 10 where no digit is present.

  Files are skipped (not loaded) when they have no row in the label table,
  when they have more than 5 digits, or when they cannot be read.

  Raises:
    Exception: if a readable image is not (image_size, image_size, 3).
  """
  if label_table is None:
    label_table = agg  # table built from digitStruct.csv earlier in the notebook

  image_files = os.listdir(folder)
  # Allocate for the worst case; both arrays are trimmed to num_images below.
  dataset = np.ndarray(shape=(len(image_files), image_size, image_size, 3),
                         dtype=np.float32)
  # length and 5 digits - 10 to signify absence of a digit in sequence
  labels = np.ndarray(shape=(len(image_files), 6), dtype=np.int8)
  print(folder)
  num_images = 0
  for image in image_files:
    image_file = os.path.join(folder, image)

    # first - label data. Stray directory entries without an annotation row
    # would otherwise abort the whole load with a KeyError.
    if image not in label_table.index:
      print('No label entry for:', image_file, '- skipping.')
      continue
    label_data = label_table.loc[image]
    numbers = list(label_data.DigitLabel)
    if len(numbers) > 5:
      continue # skip this image. number of digits more than 5
    # pad the sequence to exactly 5 entries with 10s (no-op when already 5)
    numbers += [10]*(5-len(numbers))
    length = label_data.LabelCount

    # now image data
    # NOTE(review): scipy.ndimage.imread was removed in SciPy 1.2; on a newer
    # SciPy this call needs a replacement reader — confirm the environment.
    try:
      image_data = (ndimage.imread(image_file).astype(float) -
                    pixel_depth / 2) / pixel_depth
    except IOError as e:
      print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.')
      continue
    if image_data.shape != (image_size, image_size, 3):
      raise Exception('Unexpected image shape: %s' % str(image_data.shape))

    # Store label and pixels together, only once both are known to be valid.
    labels[num_images, :] = np.array([length] + numbers)
    dataset[num_images, :, :, :] = image_data
    num_images = num_images + 1

  # Drop the unused tail left by skipped files.
  dataset = dataset[0:num_images, :, :, :]
  labels = labels[0:num_images, :]

  print('Full dataset tensor:', dataset.shape)
  print('Full labels tensor:', labels.shape)
  if num_images:
    # Statistics of an empty array are undefined (nan plus a RuntimeWarning).
    print('Mean:', np.mean(dataset))
    print('Standard deviation:', np.std(dataset))
  return (dataset, labels)

# Load the image tensor and the matching label rows from the resized training folder.
(train_dataset, labels) = load_letter(folder='train/proc/resized')

train/proc/resized
Full dataset tensor: (23468, 32, 32, 3)
Full labels tensor: (23468, 6)
Mean: 0.0463026
Standard deviation: 0.239705


In [44]:
# Show one label row: digit count first, then five digit slots (10 = no digit).
labels[1, :]

array([ 3,  6,  5,  9, 10, 10], dtype=int8)