In [1]:
import json, os, sys, glob
import subprocess
import tarfile

import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from IPython.display import Image,display
from six.moves.urllib.request import urlretrieve

# 1. Download and decompress dataset

In [2]:
def may_create_folder(folder):
    """Make sure the directory ``folder`` exists, creating it if necessary.

    Missing parent directories are created as well, and an already existing
    directory is left untouched.

    :param folder: path of the directory to ensure.
    :raises OSError: if the directory cannot be created (for example when the
        path already exists as a regular file).
    """
    # makedirs(exist_ok=True) copes with nested paths and avoids the
    # check-then-create race of isdir() followed by mkdir().
    os.makedirs(folder, exist_ok=True)
    print("Folder %s present now" % folder)

#may_create_folder("./test")

In [3]:
# Base URL the dataset archives are downloaded from; the archive name is appended to it.
download_site = "http://ufldl.stanford.edu/housenumbers/"
# Local folder the downloaded archives are stored in.
data_root = "./data"
# Last percentage written by download_progress_hook; None until the first report.
last_percent_reported = None

def download_progress_hook(count, blockSize, totalSize):
  """Write download progress to stdout; meant to be passed as ``reporthook`` to ``urlretrieve``.

  Each time the integer percentage changes, the percentage itself is written when
  it is a multiple of 5 and a single dot otherwise. The last reported value is
  kept in the module-level ``last_percent_reported``.

  :param count: number of blocks transferred so far.
  :param blockSize: size of one block in bytes.
  :param totalSize: total size of the download in bytes.
  """
  global last_percent_reported
  if totalSize <= 0:
    # Without a positive total size no percentage can be computed
    # (a zero size would otherwise raise ZeroDivisionError).
    return
  # count * blockSize can overshoot the real size on the final block, so cap at 100.
  percent = min(100, int(count * blockSize * 100 / totalSize))

  if last_percent_reported != percent:
    if percent % 5 == 0:
      sys.stdout.write("%s%%" % percent)
    else:
      sys.stdout.write(".")
    sys.stdout.flush()

    last_percent_reported = percent

def may_download(tar_file_name, expected_size, force = False):
    """Fetch an archive into ``data_root`` unless it is already there, then check its size.

    :param tar_file_name: archive name, appended to ``download_site`` and to ``data_root``.
    :param expected_size: size in bytes the file on disk must have.
    :param force: download even when the file already exists.
    :return: path of the archive inside ``data_root``.
    :raises Exception: if the size of the file on disk differs from ``expected_size``.
    """
    dest_filename = os.path.join(data_root, tar_file_name)
    needs_fetch = force or not os.path.exists(dest_filename)
    if needs_fetch:
        print("Attempt to download file " + tar_file_name)
        urlretrieve(download_site + tar_file_name, dest_filename, reporthook=download_progress_hook)
        print("Download complete!")

    # Guard clause: anything but an exact size match is treated as a failed download.
    actual_size = os.stat(dest_filename).st_size
    if actual_size != expected_size:
        raise Exception("Failed to verify file : " + dest_filename)

    print("Downloaded and verified file : " + dest_filename)
    return dest_filename
    
# Make sure the download folder exists, then fetch both archives (skipped when the
# file is already on disk) and verify each against its expected size in bytes.
may_create_folder(data_root)
test_filename = may_download("test.tar.gz", 276555967)
train_filename = may_download("train.tar.gz", 404141560)

Folder ./data present now
Downloaded and verified file : ./data/test.tar.gz
Downloaded and verified file : ./data/train.tar.gz


In [4]:
import os, tarfile

def maybe_extract(filename, force = False):
    """Unpack a ``.tar.gz`` archive next to itself and return the extracted folder path.

    The returned folder is the archive path with its last two extensions removed
    (``./data/train.tar.gz`` -> ``./data/train``). Extraction is skipped when that
    folder already exists, unless ``force`` is set.

    NOTE(review): this assumes the archive contains a single top-level folder named
    like the archive itself - confirm for any new archive.

    :param filename: path of the archive to unpack.
    :param force: extract even when the target folder already exists.
    :return: path of the folder the archive content is expected in.
    """
    root = os.path.splitext(os.path.splitext(filename)[0])[0]
    if os.path.isdir(root) and not force:
        print("{0} already present --- Skipping extraction of {1}".format(root, filename))
    else:
        print("Extract data for {0}, this may take a while, please wait ..".format(filename))
        # Extract into the archive's own directory so the content ends up at `root`;
        # a bare extractall() unpacks into the current working directory instead.
        target_dir = os.path.dirname(filename) or "."
        # The context manager closes the archive even if extraction fails.
        with tarfile.open(filename) as tar:
            tar.extractall(target_dir)
    return root

# Unpack both archives (skipped when the target folder already exists) and keep
# the paths of the extracted folders.
train_folders = maybe_extract(train_filename)
test_folders = maybe_extract(test_filename)
        

./data/train already present --- Skipping extraction of ./data/train.tar.gz
./data/test already present --- Skipping extraction of ./data/test.tar.gz


# 2. Transform the .mat files to JSON files
Use the Python script taken from http://www.a2ialab.com/lib/exe/fetch.php?media=public:scripts:svhn_dataextract_tojson.py.txt, saved locally as `svhn_dataextract_tojson.py`.

In [5]:
# Collect every .mat file that sits one folder level below ./data.
# NOTE(review): glob does not promise a particular ordering of the matches, so code
# that indexes into this list should not rely on train coming before test.
digit_struct_files = glob.glob('./data/*/*.mat')
print(digit_struct_files)

['./data/train/digitStruct.mat', './data/test/digitStruct.mat']


In [7]:
# Root folder the converted JSON files are written to (one sub-folder per dataset).
jsons_root = './jsons'

def transform_mat_to_json(interpreter="python"):
    """Convert each dataset's ``digitStruct.mat`` into ``digitStruct.json``.

    For ``train`` and ``test`` the output folder below ``jsons_root`` is created if
    needed and ``svhn_dataextract_tojson.py`` (looked up in the current working
    directory) is run on ``./data/<dataset>/digitStruct.mat``. A dataset whose
    ``digitStruct.json`` already exists is skipped.

    :param interpreter: executable used to run the conversion script; the default
        keeps the original behaviour of calling ``python`` from the PATH.
    :raises subprocess.CalledProcessError: if the script exits with a non-zero status.
    :raises OSError: if ``interpreter`` cannot be started.
    """
    for file_type in ['train', 'test']:
        output_folder = os.path.join(jsons_root, file_type)
        # Also creates jsons_root itself when it is missing.
        os.makedirs(output_folder, exist_ok=True)
        if os.path.exists(os.path.join(output_folder, 'digitStruct.json')):
            continue
        print("Begin transform %s files" % file_type)
        # Argument list instead of a shell string: no quoting issues, and the output
        # prefix follows jsons_root rather than a hard-coded ./jsons.
        command = [
            interpreter, "svhn_dataextract_tojson.py",
            "-f", "./data/{0}/digitStruct.mat".format(file_type),
            "-o", os.path.join(output_folder, "digitStruct"),
        ]
        # check_call raises on failure, so a broken conversion no longer
        # reports "End transform" as if it had succeeded.
        subprocess.check_call(command)
        print("End transform %s files" % file_type)
    print("Complete !")
    
# Create the JSON output root, then convert every dataset whose JSON file is still missing.
may_create_folder(jsons_root)
transform_mat_to_json()    

Folder ./jsons present now
Complete !


In [10]:
import json, os

def get_json_information(json_file):
    '''
    Extract image names, enclosing bounding boxes and digit labels from a json file.

    The file is expected to hold a list of entries, each with a 'filename' and a
    list of 'boxes'; every box has 'left', 'top', 'width', 'height' and 'label'.

    :param json_file:  path of json file
    :return: (image_list, bounding_box_list, labels_list)
      image_list : [filename1, filename2, ...]
      bounding_box_list : [(x1, y1, width, height), ...] - one box per image that
                          encloses all of the image's digit boxes
      labels_list : [[label1, label2, ...], ...] - the labels of each image as ints
    :raises Exception: if json_file does not exist
    :raises ValueError: if an entry contains no boxes
    '''
    if not os.path.exists(json_file):
        raise Exception("{0} is not exist".format(json_file))
    with open(json_file, 'r') as file:
        json_info = json.load(file)
    image_list = []
    bounding_box_list = []
    labels_list = []
    for info in json_info:
        boxes = info['boxes']
        if not boxes:
            raise ValueError("{0} has no boxes in {1}".format(info['filename'], json_file))
        image_list.append(info['filename'])

        # Take the extremes over all boxes: using only the first box's left/top and
        # the last box's right/bottom cuts digits off when tops or heights differ.
        x1 = min(box['left'] for box in boxes)
        y1 = min(box['top'] for box in boxes)
        x2 = max(box['left'] + box['width'] for box in boxes)
        y2 = max(box['top'] + box['height'] for box in boxes)
        bounding_box_list.append((x1, y1, x2 - x1, y2 - y1))
        labels_list.append([int(box['label']) for box in boxes])
    return image_list, bounding_box_list, labels_list

# Root folder of the converted JSON files (set again here so this cell can run on its own).
jsons_root = './jsons'

# Parse the train and test annotation files into parallel lists of image names,
# bounding boxes and labels.
train_json_file = os.path.join(jsons_root, 'train/digitStruct.json')
test_json_file = os.path.join(jsons_root, 'test/digitStruct.json')
train_image_list, train_bounding_box_list, train_labels_list = get_json_information(train_json_file)
test_image_list, test_bounding_box_list, test_labels_list = get_json_information(test_json_file)
# Print the first five entries of every list as a quick sanity check.
print("train_image_list : ", train_image_list[0:5])
print("train_bounding_box_list : ", train_bounding_box_list[0:5])
print("train_labels_list : ", train_labels_list[0:5])
print("test_image_list : ", test_image_list[0:5])
print("test_bounding_box_list : ", test_bounding_box_list[0:5])
print("test_labels_list : ", test_labels_list[0:5])

train_image_list :  ['1.png', '2.png', '3.png', '4.png', '5.png']
train_bounding_box_list :  [(246.0, 77.0, 173.0, 223.0), (77.0, 29.0, 47.0, 28.0), (17.0, 5.0, 17.0, 15.0), (57.0, 13.0, 28.0, 34.0), (52.0, 7.0, 37.0, 49.0)]
train_labels_list :  [[1, 9], [2, 3], [2, 5], [9, 3], [3, 1]]
test_image_list :  ['1.png', '2.png', '3.png', '4.png', '5.png']
test_bounding_box_list :  [(43.0, 7.0, 19.0, 30.0), (99.0, 5.0, 34.0, 24.0), (61.0, 6.0, 11.0, 16.0), (32.0, 6.0, 14.0, 17.0), (97.0, 28.0, 19.0, 28.0)]
test_labels_list :  [[5], [2, 1, 10], [6], [1], [9]]


In [None]:
# FIXME(review): this cell held the unfinished statement `from IPython.draw`, which is a
# SyntaxError and stops a "Restart & Run All". Image and display are already imported
# from IPython.display in the first cell; add the intended import there once it is known.

## Load and analyze the data first
1. Load the .mat data and transform it to JSON

In [None]:
# Open the first .mat file found earlier for interactive inspection.
# NOTE(review): which dataset index 0 refers to depends on the order glob returned - confirm.
digit_struct_file = digit_struct_files[0]
#digit = sio.loadmat(digit_struct_file)
# scipy.io was tried first, but it fails with "NotImplementedError: Please use HDF reader
# for matlab v7.3 files", so the file is opened with h5py instead.
print(digit_struct_file)
mat_file = h5py.File(digit_struct_file, 'r')
mat_file

In [None]:
# Show how many top-level entries the opened file has, then print each entry yielded by iterating it.
print(len(mat_file))
for mat in mat_file:
    print(mat)

In [None]:
# Keep handles to the 'name' and 'bbox' members of the 'digitStruct' entry.
# Presumably these hold the per-image file names and bounding boxes - verify against the file.
digitStructName = mat_file['digitStruct']['name']
digitStructBbox = mat_file['digitStruct']['bbox']

In [None]:
# Decode the first entry of digitStructName: follow its reference into the file and join
# the stored character codes into a string.
# Dataset[()] replaces the Dataset.value attribute that h5py 3.0 removed, and flatten()
# hands chr() plain integer scalars instead of one-element rows.
print(''.join(chr(c) for c in mat_file[digitStructName[0][0]][()].flatten()))

In [None]:
# Placeholder cell: only prints an empty line.
print('')