In [1]:
import pandas as pd
import json, os, sys, glob
import numpy as np
import h5py
import tensorflow as tf
from IPython.display import Image,display
import matplotlib.pyplot as plt
import tarfile
from six.moves.urllib.request import urlretrieve

# 1. Download and decompress dataset

In [2]:
def may_create_folder(folder):
    """Ensure that the directory ``folder`` exists, creating it when missing.

    Missing parent directories are created as well, and a directory that is
    already present is left untouched.

    :param folder: path of the directory to create
    :raises OSError: if ``folder`` exists but is not a directory, or cannot be created
    """
    # makedirs(exist_ok=True) copes with nested paths and removes the
    # check-then-create race of a separate isdir()/mkdir() pair.
    os.makedirs(folder, exist_ok=True)
    print("Folder %s present now" % folder)

#may_create_folder("./test")

In [3]:
# Base URL that an archive file name is appended to when downloading.
download_site = "http://ufldl.stanford.edu/housenumbers/"
# Local directory the downloaded archives are stored in.
data_root = "./data"
# Last percentage handled by download_progress_hook; None until its first report.
last_percent_reported = None

def download_progress_hook(count, blockSize, totalSize):
    """Report the progress of a download on stdout.

    Used as the ``reporthook`` callback of ``urlretrieve``; mostly intended for
    users with slow internet connections. Each time the integer percentage
    changes, the percentage itself is written when it is a multiple of 5 and a
    dot is written otherwise.

    :param count: number of blocks transferred so far
    :param blockSize: size of one block in bytes
    :param totalSize: total size of the file in bytes; when it is not positive
        no percentage can be computed and nothing is reported
    """
    global last_percent_reported

    # A zero total would raise ZeroDivisionError and a negative one would
    # produce meaningless negative percentages, so skip reporting entirely.
    if totalSize <= 0:
        return

    # count * blockSize can exceed totalSize when the final block is only
    # partly filled; clamp so the report never goes past 100%.
    percent = min(100, int(count * blockSize * 100 / totalSize))

    if last_percent_reported != percent:
        if percent % 5 == 0:
            sys.stdout.write("%s%%" % percent)
        else:
            sys.stdout.write(".")
        sys.stdout.flush()
        last_percent_reported = percent

def may_download(tar_file_name, expected_size, force = False):
    """Fetch an archive into ``data_root`` when needed and check its size.

    The file is downloaded when it is not present locally or when ``force`` is
    set; in every case the size of the local file is compared with
    ``expected_size`` afterwards.

    :param tar_file_name: archive name; appended to ``download_site`` to form
        the URL and joined to ``data_root`` to form the local path
    :param expected_size: expected size of the local file in bytes
    :param force: download even when the local file already exists
    :return: path of the local file
    :raises Exception: if the local file size differs from ``expected_size``
    """
    local_path = os.path.join(data_root, tar_file_name)

    needs_download = force or not os.path.exists(local_path)
    if needs_download:
        print("Attempt to download file " + tar_file_name)
        source_url = download_site + tar_file_name
        urlretrieve(source_url, local_path, reporthook=download_progress_hook)
        print("Download complete!")

    # The check also covers a file that was already present before this call.
    actual_size = os.stat(local_path).st_size
    if actual_size != expected_size:
        raise Exception("Failed to verify file : " + local_path)

    print("Downloaded and verified file : " + local_path)
    return local_path
    
# Make sure the data directory exists before anything is downloaded into it.
may_create_folder(data_root)
# The second argument is the expected archive size in bytes, checked by may_download.
test_filename = may_download("test.tar.gz", 276555967)
train_filename = may_download("train.tar.gz", 404141560)

Folder ./data present now
Downloaded and verified file : ./data/test.tar.gz
Downloaded and verified file : ./data/train.tar.gz


In [4]:
import os, tarfile

def maybe_extract(filename, force = False):
    """Extract a ``.tar.gz`` archive next to itself unless that was already done.

    The target folder is ``filename`` with its last two extensions removed
    (``./data/train.tar.gz`` -> ``./data/train``). Extraction is skipped when
    that folder exists, unless ``force`` is set.

    :param filename: path of the archive
    :param force: extract even when the target folder already exists
    :return: path of the target folder; this assumes the archive's top-level
        directory is named like the archive itself
    """
    root = os.path.splitext(os.path.splitext(filename)[0])[0]
    if os.path.isdir(root) and not force:
        print("{0} already present --- Skipping extraction of {1}".format(root, filename))
    else:
        print("Extract data for {0}, this may take a while, please wait ..".format(filename))
        # Unpack into the archive's own directory so the content lands at `root`;
        # extractall() without a path unpacks into the current working directory.
        destination = os.path.dirname(filename) or "."
        # NOTE(review): extractall() trusts the member paths stored in the archive;
        # only use it on archives from a trusted source.
        with tarfile.open(filename) as tar:
            tar.extractall(destination)
    return  root

# Unpack both archives; maybe_extract skips the work when the target folder exists.
train_folders = maybe_extract(train_filename)
test_folders = maybe_extract(test_filename)
        

./data/train already present --- Skipping extraction of ./data/train.tar.gz
./data/test already present --- Skipping extraction of ./data/test.tar.gz


# 2. Transform the .mat files to JSON files
This uses the Python script `svhn_dataextract_tojson.py`, taken from http://www.a2ialab.com/lib/exe/fetch.php?media=public:scripts:svhn_dataextract_tojson.py.txt

In [5]:
# Collect every .mat file located exactly one directory level below ./data.
digit_struct_files = glob.glob('./data/*/*.mat')
print(digit_struct_files)

['./data/train/digitStruct.mat', './data/test/digitStruct.mat']


In [6]:
# Directory that receives one sub-folder of JSON output per dataset split.
jsons_root = './jsons'

def transform_mat_to_json():
    """Convert the train and test ``digitStruct.mat`` files to JSON.

    For each of ``train`` and ``test`` the output folder below ``jsons_root`` is
    created when missing and ``svhn_dataextract_tojson.py`` is run on
    ``./data/<type>/digitStruct.mat``. A type whose ``digitStruct.json`` already
    exists is skipped.

    :raises RuntimeError: if the conversion script exits with a non-zero status
    """
    import subprocess

    for file_type in ['train', 'test']:
        output_folder = os.path.join(jsons_root, file_type)
        os.makedirs(output_folder, exist_ok=True)
        if os.path.exists(os.path.join(output_folder, 'digitStruct.json')):
            continue
        print("Begin transform %s files" % file_type)
        # An argument list (no shell involved) keeps paths with spaces intact.
        command = [
            "python", "svhn_dataextract_tojson.py",
            "-f", "./data/{0}/digitStruct.mat".format(file_type),
            # Derived from output_folder so the script writes where the
            # existence check above looks.
            "-o", os.path.join(output_folder, "digitStruct"),
        ]
        exit_status = subprocess.call(command)
        if exit_status != 0:
            # Fail loudly; otherwise "End transform" and "Complete !" would be
            # printed for a conversion that did not happen.
            raise RuntimeError(
                "Transforming {0} files failed with exit status {1}".format(file_type, exit_status))
        print("End transform %s files" % file_type)
    print("Complete !")
    
# Create the JSON root first; transform_mat_to_json puts its per-type folders below it.
may_create_folder(jsons_root)
transform_mat_to_json()    

Folder ./jsons present now
Complete !


In [9]:
def get_json_information(json_file):
    '''
    Load the digit annotations stored in a digitStruct json file.

    :param json_file: path of the json file
    :return: a list with one dict per image, each holding
      'filename' : name of the image file
      'bbox_x1' : left coordinate of the box enclosing all digit boxes
      'bbox_y1' : top coordinate of that enclosing box
      'boxes' : np.array of dicts with 'left', 'top', 'width' and 'height'
                (cast to int), one dict per digit box
      'bbox_width' : width of the enclosing box
      'bbox_height' : height of the enclosing box
      'labels' : np.array with the int label of every digit box
    :raises Exception: if json_file does not exist
    '''
    if not os.path.exists(json_file):
        raise Exception("{0} is not exist".format(json_file))

    with open(json_file, 'r') as handle:
        entries = json.load(handle)

    box_fields = ['left', 'top', 'width', 'height']

    def summarize(entry):
        # Build the record of a single image.
        image_name = entry['filename']
        digit_boxes = entry['boxes']

        # Corners of the smallest rectangle that covers every digit box.
        left = int(np.min([b['left'] for b in digit_boxes]))
        top = int(np.min([b['top'] for b in digit_boxes]))
        right = int(np.max([b['left'] + b['width'] for b in digit_boxes]))
        bottom = int(np.max([b['top'] + b['height'] for b in digit_boxes]))

        return {
            'filename': image_name,
            'bbox_x1': left,
            'bbox_y1': top,
            'boxes': np.array([{field: int(b[field]) for field in box_fields} for b in digit_boxes]),
            'bbox_width': right - left,
            'bbox_height': bottom - top,
            'labels': np.array([int(b['label']) for b in digit_boxes]),
        }

    return [summarize(entry) for entry in entries]

# Parse the annotations of both splits from the JSON files below jsons_root.
train_json_file_path = os.path.join(jsons_root, 'train/digitStruct.json')
test_json_file_path = os.path.join(jsons_root, 'test/digitStruct.json')
train_json_info = get_json_information(train_json_file_path)
test_json_info = get_json_information(test_json_file_path)
# Show only the first five entries of each split.
print("Train json info : \n", train_json_info[0:5])
print("=" * 50)
print("Test json info : \n", test_json_info[0:5])

Train json info : 
 [{'filename': '1.png', 'bbox_x1': 246, 'bbox_y1': 77, 'boxes': array([{'left': 246, 'top': 77, 'width': 81, 'height': 219},
       {'left': 323, 'top': 81, 'width': 96, 'height': 219}], dtype=object), 'bbox_width': 173, 'bbox_height': 223, 'labels': array([1, 9])}, {'filename': '2.png', 'bbox_x1': 77, 'bbox_y1': 25, 'boxes': array([{'left': 77, 'top': 29, 'width': 23, 'height': 32},
       {'left': 98, 'top': 25, 'width': 26, 'height': 32}], dtype=object), 'bbox_width': 47, 'bbox_height': 36, 'labels': array([2, 3])}, {'filename': '3.png', 'bbox_x1': 17, 'bbox_y1': 5, 'boxes': array([{'left': 17, 'top': 5, 'width': 8, 'height': 15},
       {'left': 25, 'top': 5, 'width': 9, 'height': 15}], dtype=object), 'bbox_width': 17, 'bbox_height': 15, 'labels': array([2, 5])}, {'filename': '4.png', 'bbox_x1': 57, 'bbox_y1': 13, 'boxes': array([{'left': 57, 'top': 13, 'width': 15, 'height': 34},
       {'left': 72, 'top': 13, 'width': 13, 'height': 34}], dtype=object), 'bbox_wi

In [None]:
# FIXME(review): this cell held the unfinished statement `from IPython.draw`, which is a
# SyntaxError and stops the notebook from running top to bottom. `Image` and `display`
# are already imported from IPython.display in the first cell; complete or delete this cell.

## Load and analyze the data first
1. Load the .mat data and transform it to JSON

In [None]:
# Inspect the first .mat file from the digit_struct_files list.
digit_struct_file = digit_struct_files[0]
#digit = sio.loadmat(digit_struct_file)
# scipy.io was tried first, but it fails with "NotImplementedError: Please use HDF reader
# for matlab v7.3 files", so the file is opened with h5py instead.
print(digit_struct_file)
# Opened read-only; the handle stays open because the following cells read from it.
mat_file = h5py.File(digit_struct_file, 'r')
mat_file

In [None]:
# Print the number of top-level entries, then every item yielded by iterating the file
# (presumably the entry names -- TODO confirm against the h5py documentation).
print(len(mat_file))
for mat in mat_file:
    print(mat)

In [None]:
# Keep handles to the 'name' and 'bbox' members of the 'digitStruct' entry.
digitStructName = mat_file['digitStruct']['name']
digitStructBbox = mat_file['digitStruct']['bbox']

In [None]:
# `Dataset.value` was removed in h5py 3.0; indexing with `[()]` reads the whole dataset
# on old and new h5py versions alike.
name_codes = mat_file[digitStructName[0][0]][()]
# Flatten so every element is a single character code whatever shape the data is stored in.
print(''.join(chr(int(code)) for code in np.asarray(name_codes).ravel()))

In [None]:
# Placeholder cell: only prints an empty line.
print('')