In [1]:
import os

root = 'E:/OCR/CASIA/data'  # root directory where the CASIA dataset is stored
os.listdir(root)

['HWDB1.0trn.zip',
 'HWDB1.0trn_gnt.zip',
 'HWDB1.0tst.zip',
 'HWDB1.0tst_gnt.zip',
 'HWDB1.1trn.zip',
 'HWDB1.1trn_gnt.zip',
 'HWDB1.1tst.zip',
 'HWDB1.1tst_gnt.zip',
 'OLHWDB1.0test_pot.zip',
 'OLHWDB1.0train_pot.zip',
 'OLHWDB1.0trn.zip',
 'OLHWDB1.0tst.zip',
 'OLHWDB1.1trn.zip',
 'OLHWDB1.1trn_pot.zip',
 'OLHWDB1.1tst.zip',
 'OLHWDB1.1tst_pot.zip']

In [2]:
import os
import sys
import zipfile
import struct
import pandas as pd
import numpy as np
import tables as tb
import time

# Open the HWDB1.1trn_gnt.zip archive under `root` and list its member names.
z = zipfile.ZipFile(f'{root}/HWDB1.1trn_gnt.zip')
z.namelist()

['1001-c.gnt',
 '1002-c.gnt',
 '1003-c.gnt',
 '1004-c.gnt',
 '1005-c.gnt',
 '1006-c.gnt',
 '1007-c.gnt',
 '1008-c.gnt',
 '1009-c.gnt',
 '1010-c.gnt',
 '1011-c.gnt',
 '1012-c.gnt',
 '1013-c.gnt',
 '1014-c.gnt',
 '1015-c.gnt',
 '1016-c.gnt',
 '1017-c.gnt',
 '1018-c.gnt',
 '1019-c.gnt',
 '1020-c.gnt',
 '1021-c.gnt',
 '1022-c.gnt',
 '1023-c.gnt',
 '1024-c.gnt',
 '1025-c.gnt',
 '1026-c.gnt',
 '1027-c.gnt',
 '1028-c.gnt',
 '1029-c.gnt',
 '1030-c.gnt',
 '1031-c.gnt',
 '1032-c.gnt',
 '1033-c.gnt',
 '1034-c.gnt',
 '1035-c.gnt',
 '1036-c.gnt',
 '1037-c.gnt',
 '1038-c.gnt',
 '1039-c.gnt',
 '1040-c.gnt',
 '1041-c.gnt',
 '1042-c.gnt',
 '1043-c.gnt',
 '1044-c.gnt',
 '1045-c.gnt',
 '1046-c.gnt',
 '1047-c.gnt',
 '1048-c.gnt',
 '1049-c.gnt',
 '1050-c.gnt',
 '1051-c.gnt',
 '1052-c.gnt',
 '1053-c.gnt',
 '1054-c.gnt',
 '1055-c.gnt',
 '1056-c.gnt',
 '1057-c.gnt',
 '1058-c.gnt',
 '1059-c.gnt',
 '1060-c.gnt',
 '1061-c.gnt',
 '1062-c.gnt',
 '1063-c.gnt',
 '1064-c.gnt',
 '1065-c.gnt',
 '1066-c.gnt',
 '1067-c.g

In [3]:
%matplotlib inline
from matplotlib import pyplot as plt

In [4]:
class GNT:
    """Decoder for one ``.gnt`` member of an opened zip archive.

    Iterating over an instance yields ``(img, tag_code)`` pairs, where ``img``
    is a 2-D ``uint8`` array of shape ``(height, width)`` and ``tag_code`` is
    the 2-byte label decoded with the ``gb2312-80`` codec.

    Each record is read as: 4-byte sample size, 2-byte tag code, 2-byte width,
    2-byte height, then ``width * height`` bitmap bytes.

    Parameters
    ----------
    Z : object with an ``open(name)`` method returning a binary file object
        (for example ``zipfile.ZipFile``).
    set_name : str
        Name of the ``.gnt`` member to decode.

    Raises
    ------
    struct.error
        If a record header is truncated.
    ValueError
        If fewer bitmap bytes are available than ``width * height``.
    """

    def __init__(self, Z, set_name):
        self.Z = Z
        self.set_name = set_name  # name of the .gnt member inside the archive

    def __iter__(self):
        with self.Z.open(self.set_name) as fp:
            while True:
                head = fp.read(4)
                if not head:
                    # Clean end of file: no further record header.
                    break
                # Explicit little-endian, fixed-size formats. The native 'l'
                # format is 8 bytes wide on most 64-bit Unix platforms and
                # cannot unpack the 4-byte size field there.
                sample_size = struct.unpack('<I', head)[0]
                tag_code = fp.read(2).decode('gb2312-80')
                width, height = struct.unpack('<2H', fp.read(4))
                bitmap = np.frombuffer(fp.read(width * height), np.uint8)
                img = bitmap.reshape((height, width))
                yield img, tag_code
                if not sample_size:
                    # Stop after a record whose size field is 0 instead of
                    # reading on.
                    break

In [5]:
# Build a decoder for the '1071-c.gnt' member of the currently open archive `z`.
set_name = '1071-c.gnt'
gnt = GNT(z, set_name)

In [6]:
# Materialise every (image, label) pair produced by the decoder.
dataset = list(gnt)

In [7]:
# Display the decoded (image, label) pairs.
dataset

[(array([[255, 255, 255, ..., 255, 255, 255],
         [255, 255, 255, ..., 255, 255, 255],
         [255, 255, 255, ..., 255, 255, 255],
         ...,
         [255, 255, 255, ..., 222, 255, 255],
         [255, 255, 255, ..., 255, 255, 255],
         [255, 255, 255, ..., 255, 255, 255]], dtype=uint8), '墨'),
 (array([[255, 255, 255, ..., 255, 255, 255],
         [255, 255, 255, ..., 255, 255, 255],
         [255, 255, 255, ..., 255, 255, 255],
         ...,
         [255, 255, 255, ..., 255, 255, 255],
         [255, 255, 255, ..., 255, 255, 255],
         [255, 255, 255, ..., 255, 255, 255]], dtype=uint8), '默'),
 (array([[255, 255, 255, ..., 255, 255, 255],
         [255, 255, 255, ..., 255, 255, 255],
         [255, 255, 255, ..., 255, 255, 255],
         ...,
         [255, 255, 255, ..., 255, 255, 255],
         [255, 255, 255, ..., 255, 255, 255],
         [255, 255, 255, ..., 255, 255, 255]], dtype=uint8), '沫'),
 (array([[255, 255, 255, ..., 255, 255, 255],
         [255, 255, 2

In [8]:
class MPF:
    """Decoder for an ``.mpf`` feature file read from a binary stream.

    The constructor consumes the header from ``fp``; iterating over the
    instance then yields ``(data, label)`` pairs read from the same stream,
    so an instance can be iterated only once.

    Header fields, in the order they are read: 4-byte header size, 8-byte
    format code, descriptive text (``header_size - 62`` bytes), 20-byte code
    type, 2-byte code length, 20-byte data type, 4-byte sample count and
    4-byte feature dimensionality. The fixed-width fields total 62 bytes.

    Attributes
    ----------
    code_format : str   format code, e.g. "MPF"
    text : str          free-text description stored in the header
    code_type : str     label encoding name, e.g. "ASCII", "GB"
    code_length : int   number of bytes per label
    data_type : str     data-type name exactly as stored in the header
    dtype               numpy dtype used to decode one feature value
    nrows : int         number of samples
    ndims : int         number of feature values per sample

    Raises
    ------
    struct.error
        If the header is truncated.
    TypeError
        If the stored data-type name is neither listed in ``_DTYPES`` nor
        understood by ``numpy.dtype``.
    """

    # Data-type names stored in the header mapped to numpy dtypes.
    # NOTE(review): 'short' and 'float' are taken to be 2-byte and 4-byte
    # little-endian values, as in C on the platforms that write these files.
    _DTYPES = {
        'unsigned char': np.uint8,
        'short': np.dtype('<i2'),
        'float': np.dtype('<f4'),
    }

    def __init__(self, fp):
        self._fp = fp
        # Decode the header. Explicit little-endian fixed-size formats are
        # used because the native 'l' format is 8 bytes wide on most 64-bit
        # Unix platforms and cannot unpack a 4-byte field there.
        header_size = struct.unpack('<i', self._fp.read(4))[0]
        # Storage format code, e.g. "MPF".
        self.code_format = self._fp.read(8).decode('ascii').rstrip('\x00')
        # Free-text description.
        self.text = self._fp.read(header_size - 62).decode().rstrip('\x00')
        # Label encoding, e.g. "ASCII", "GB", etc.
        self.code_type = self._fp.read(20).decode('latin-1').rstrip('\x00')
        # Number of bytes per label.
        self.code_length = struct.unpack('<h', self._fp.read(2))[0]
        # Data-type name of the feature values.
        self.data_type = self._fp.read(20).decode('ascii').rstrip('\x00')
        if self.data_type in self._DTYPES:
            self.dtype = self._DTYPES[self.data_type]
        else:
            self.dtype = np.dtype(self.data_type)
        # Number of samples, then feature dimensionality.
        self.nrows, self.ndims = struct.unpack('<2i', self._fp.read(8))

    def __iter__(self):
        # A feature vector occupies ndims values, not ndims bytes.
        row_bytes = self.ndims * np.dtype(self.dtype).itemsize
        for _ in range(self.nrows):
            # Sample label.
            label = self._fp.read(self.code_length).decode('gb2312-80')
            # Sample feature vector.
            data = np.frombuffer(self._fp.read(row_bytes), self.dtype)
            yield data, label

In [9]:
# Rebind `z` to the HWDB1.0trn.zip archive (it previously referred to the
# HWDB1.1trn_gnt.zip archive) and show members 1 to 4 of its listing.
z = zipfile.ZipFile(f'{root}/HWDB1.0trn.zip')
z.namelist()[1:5]

['HWDB1.0trn/001.mpf',
 'HWDB1.0trn/002.mpf',
 'HWDB1.0trn/003.mpf',
 'HWDB1.0trn/004.mpf']

In [10]:
# Decode the feature file of the first writer. The archive member is opened in
# a `with` block so its handle is closed as soon as all records are read.
with z.open('HWDB1.0trn/001.mpf') as fp:
    mpf = MPF(fp)  # parse the header
    df = pd.DataFrame.from_records([(label, data) for data, label in mpf])
# Turn the records into a DataFrame: one row per sample, one column per
# feature, indexed by label.
df = pd.DataFrame(data=np.column_stack(df[1]).T, index=df[0])
df.head()  # show the first 5 characters

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
扼,11,11,0,3,8,1,0,0,16,4,...,0,0,41,7,8,4,14,10,0,0
遏,5,8,25,2,2,22,7,0,32,20,...,10,0,20,28,24,18,9,5,3,0
鄂,27,10,5,35,7,1,8,12,57,25,...,1,0,9,7,2,0,5,15,1,0
饿,21,1,1,11,2,2,3,13,34,1,...,25,3,16,1,15,20,11,2,35,17
恩,26,4,2,4,1,26,12,0,60,9,...,1,3,9,5,42,31,5,0,4,3


In [11]:
# Show the free-text description stored in the file header.
mpf.text

'Character features extracted from grayscale images. #ftrtype=ncg, #norm=ldi, #aspect=4, #dirn=8, #zone=8, #zstep=8, #fstep=8, $deslant=0, $smooth=0, $nmdir=0, $multisc=0'

In [12]:
class Bunch(dict):
    """A ``dict`` whose items are also readable and writable as attributes.

    ``b.key`` and ``b['key']`` refer to the same entry because the instance
    uses itself as its attribute dictionary.

    The aliasing is set up in ``__new__`` rather than ``__init__`` so that it
    also holds for instances rebuilt without calling ``__init__``, such as
    those returned by ``copy.copy`` or ``pickle.load``. Otherwise attribute
    writes on such copies would no longer show up as keys.

    Accepts the same arguments as ``dict``.
    """

    def __new__(cls, *args, **kwargs):
        self = super().__new__(cls)
        # Make the mapping itself serve as the attribute namespace.
        self.__dict__ = self
        return self

In [13]:
# Decode every .mpf member of the open archive into one Bunch entry per writer.
mb = Bunch()
for name in z.namelist():
    if name.endswith('.mpf'):
        # Derive the writer id from the member's base name, so members at the
        # archive root or in nested folders are handled as well.
        writer_ = f"writer{os.path.splitext(name.rsplit('/', 1)[-1])[0]}"
        with z.open(name) as fp:
            mpf = MPF(fp)
            df = pd.DataFrame.from_records([(label, data) for data, label in mpf])
            # One row per sample, one column per feature, indexed by label.
            df = pd.DataFrame(data=np.column_stack(df[1]).T, index=df[0])
            mb[writer_] = Bunch({"text":mpf.text, 'features':df})

In [17]:
# Show the label index of writer001's feature table.
mb.writer001.features.index

Index(['扼', '遏', '鄂', '饿', '恩', '而', '儿', '耳', '尔', '饵',
       ...
       '堕', '蛾', '峨', '鹅', '俄', '额', '讹', '娥', '恶', '厄'],
      dtype='object', name=0, length=3728)

In [20]:
# Show the header description stored for writer001.
mb.writer001.text

'Character features extracted from grayscale images. #ftrtype=ncg, #norm=ldi, #aspect=4, #dirn=8, #zone=8, #zstep=8, #fstep=8, $deslant=0, $smooth=0, $nmdir=0, $multisc=0'

In [21]:

# Destination HDF5 file, defined here so the cell runs on a fresh kernel.
# NOTE(review): location chosen to sit in the same folder as the other saved
# feature file used in this notebook; adjust if a different target is intended.
save_path = 'E:/OCR/CASIA/datasets/features.h5'
for writer_id in mb.keys():
    # Append each writer's feature table under its own key, compression level 7.
    mb[writer_id].features.to_hdf(save_path, key = writer_id, complevel = 7, mode = 'a')

In [52]:
# Map every entry of `root` whose name has no underscore to its full path,
# keyed by the file stem with the dots removed.
feature_paths = dict(
    (os.path.splitext(name)[0].replace('.', ''), f'{root}/{name}')
    for name in os.listdir(root)
    if name.find('_') < 0
)
feature_paths

{'HWDB10trn': 'E:/OCR/CASIA/data/HWDB1.0trn.zip',
 'HWDB10tst': 'E:/OCR/CASIA/data/HWDB1.0tst.zip',
 'HWDB11trn': 'E:/OCR/CASIA/data/HWDB1.1trn.zip',
 'HWDB11tst': 'E:/OCR/CASIA/data/HWDB1.1tst.zip',
 'OLHWDB10trn': 'E:/OCR/CASIA/data/OLHWDB1.0trn.zip',
 'OLHWDB10tst': 'E:/OCR/CASIA/data/OLHWDB1.0tst.zip',
 'OLHWDB11trn': 'E:/OCR/CASIA/data/OLHWDB1.1trn.zip',
 'OLHWDB11tst': 'E:/OCR/CASIA/data/OLHWDB1.1tst.zip'}

In [None]:
def MPF2Bunch(Z):
    """Decode every ``.mpf`` member of an opened archive into a Bunch.

    Parameters
    ----------
    Z : object exposing ``namelist()`` and ``open(name)``
        (for example ``zipfile.ZipFile``).

    Returns
    -------
    Bunch
        One entry per ``.mpf`` member, keyed ``"writer<stem>"`` where
        ``<stem>`` is the member's base name without extension. Each value is
        a Bunch with ``text`` (the header description) and ``features`` (a
        DataFrame with one row per sample, one column per feature, indexed by
        label).
    """
    mb = Bunch()
    for name in Z.namelist():
        if not name.endswith('.mpf'):
            continue
        # Use the base name so members at the archive root or in nested
        # folders get a sensible writer id.
        stem = os.path.splitext(name.rsplit('/', 1)[-1])[0]
        with Z.open(name) as fp:
            mpf = MPF(fp)
            samples = list(mpf)
        if samples:
            matrix = np.vstack([data for data, _ in samples])
        else:
            # A file with no samples still gets an (empty) feature table.
            matrix = np.empty((0, mpf.ndims), dtype=mpf.dtype)
        # The index is named 0, matching a table built from unnamed records.
        labels = pd.Index([label for _, label in samples], name=0)
        features = pd.DataFrame(data=matrix, index=labels)
        mb[f"writer{stem}"] = Bunch({"text": mpf.text, 'features': features})
    return mb

In [63]:
# NOTE(review): `h` is not assigned in any visible cell; presumably it is a
# PyTables file handle for the saved HDF5 file. Confirm and add the cell that
# opens it. The execution count (63) is also higher than that of the cells
# below (31, 33) that still read from `h`, so this close belongs after them.
h.close()

In [31]:
# Inspect the 'axis0' node stored under writer001.
# NOTE(review): relies on `h`, which no visible cell opens. Confirm.
h.root.writer001['axis0']

/writer001/axis0 (CArray(512,), shuffle, zlib(7)) ''
  atom := Int64Atom(shape=(), dflt=0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'little'
  chunkshape := (8192,)

In [33]:
# Inspect the 'block0_items' node stored under writer001.
# NOTE(review): relies on `h`, which no visible cell opens. Confirm.
h.root.writer001[ 'block0_items']

/writer001/block0_items (CArray(512,), shuffle, zlib(7)) ''
  atom := Int64Atom(shape=(), dflt=0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'little'
  chunkshape := (8192,)

In [None]:
import pickle
def bunch2json(bunch, path):
    """Serialise ``bunch`` to the file at ``path`` using pickle.

    Despite the function name, the file written is a binary pickle, not JSON.
    Any existing file at ``path`` is overwritten.
    """
    with open(path, 'wb') as sink:
        pickle.Pickler(sink).dump(bunch)


def json2bunch(path):
    """Load and return the object pickled in the file at ``path``.

    Despite the function name, the file is read as a binary pickle, not JSON.
    Unpickling can execute arbitrary code, so only use this on files you
    created yourself or otherwise trust.
    """
    with open(path, 'rb') as source:
        return pickle.Unpickler(source).load()

In [None]:
# Save the per-writer Bunch to disk and load it back.
# NOTE: despite the .json suffix, bunch2json writes a pickle file.
path = 'E:/OCR/CASIA/datasets/features.json'
bunch2json(mb, path)
X = json2bunch(path)

In [None]:
# Display the reloaded object.
X