In [1]:
import os
import struct
import numpy as np
import zipfile
import tables as tb
import pickle


class Bunch(dict):
    def __init__(self, *args, **kwds):
        super().__init__(*args, **kwds)
        self.__dict__ = self

            
class GNT:
    
    def __init__(self, buffer):
        self.buffer = buffer
          
    def __iter__(self):
        j = 0
        for i, _ in enumerate(self.buffer):
            if i == j:
                size = np.frombuffer(self.buffer[j : j + 4], 'I')[0]
                tag = self.buffer[j + 4: j + 6].decode('gb18030')
                width, height = np.frombuffer(self.buffer[j + 6: j + 10], 'H')
                bitmap = np.frombuffer(self.buffer[j + 10: j + size], 'B').reshape((height, width))
                j += size
                yield {tag : bitmap}

                
class HW(Bunch):
    
    def __init__(self, root, filename, *args, **kwds):
        super().__init__(*args, **kwds)
        self._type(filename)
        path = f'{root}{filename}'
        self.Z = zipfile.ZipFile(path)
        
    def _type(self, filename):
        if 'gnt' in filename:
            self['type'] = 'gnt'
        elif 'pot' in filename:
            self['type'] = 'pot'
        else:
            self['type'] = 'mpf'

                
class Feature(Bunch):

    def __init__(self, root, *args, **kwds):
        super().__init__(*args, **kwds)
        for filename in os.listdir(root):
            name, _ = os.path.splitext(filename)
            name = name.replace('.', '')
            hw = HW(root, filename)
            self[name] = hw
            
            
class DataSet(Bunch):
        
    def __init__(self, dataset, *args, **kwds):
        super().__init__(*args, **kwds)
        for info in dataset.Z.infolist():
            if not info.is_dir():
                name = 'writer' + os.path.splitext(info.filename)[0]
                name = name.replace('-', '_').replace('.', '')
                buffer = dataset.Z.read(info)
                
                if dataset.type == 'gnt':
                    self[name] = self._gnt(buffer)
            
    def _gnt(self, buffer):
        gnt = GNT(buffer)
        X = np.asanyarray([item for item in iter(gnt)])
        return X

In [2]:
temp = 'D:/temp/'

root = 'E:/OCR/CASIA/HW/'

In [3]:
%%time
ft = Feature(root)
dataset = ft.HWDB11trn_gnt

Wall time: 26.7 ms


In [4]:
%%time
D = DataSet(dataset)

Wall time: 27min 11s


In [15]:
D.writer1001_c.shape

(3749,)