In [1]:
import sys
# Add ../utils to the module search path; presumably this is where the local
# helper modules imported in later cells live -- TODO confirm.
sys.path.append('../utils')

## 单字特征的解读

In [2]:
import os
import tables as tb 

from casia_feature import MPF2Bunch, bunch2hdf
from dataset import bunch2json, json2bunch, Bunch

# Root directory that holds the CASIA dataset files.
root = 'E:/OCR/CASIA/data'

# Map a dataset key to the path of its source file.  The key is the file name
# without its last extension and with every '.' removed; directory entries
# whose name contains '_' are left out.
feature_paths = {}
for name in os.listdir(root):
    if '_' in name:
        continue
    stem = os.path.splitext(name)[0]
    feature_paths[stem.replace('.', '')] = f'{root}/{name}'

# Parse every source file with MPF2Bunch and collect the results in a single
# Bunch that uses the same keys as feature_paths.
root_dict = Bunch({
    key: MPF2Bunch(path) for key, path in feature_paths.items()
})

In [3]:
%%time
# Destination file for the HDF5 export of the parsed features.
save_path = 'E:/OCR/CASIA/datasets/features.h5'
# Write the whole root_dict to save_path through the project helper bunch2hdf.
bunch2hdf(root_dict, save_path)

Wall time: 2min 17s


In [4]:
%%time
# Destination file for the JSON export of the same data.
json_path = 'E:/OCR/CASIA/datasets/features.json'
# Write the whole root_dict to json_path through the project helper bunch2json.
bunch2json(root_dict, json_path)

Wall time: 1min 16s


In [5]:
%%time
# Open the HDF5 file with PyTables using its default mode.  The handle `h` is
# reused by later cells and is not closed in the visible part of the notebook;
# call h.close() once it is no longer needed.
h = tb.open_file(save_path)

Wall time: 115 ms


In [6]:
%%time
# Load the JSON export back through the project helper json2bunch; later cells
# access the result by attribute and through keys().
j = json2bunch(json_path)

Wall time: 32.9 s


In [7]:
from sys import getsizeof

# Total on-disk size, in bytes, of all original source files.
source_size = sum(os.path.getsize(path) for path in feature_paths.values())

# NOTE: sys.getsizeof is shallow -- it measures only the top-level Python
# object, not the data reachable from it -- so the two "object" figures
# printed below are not the in-memory size of the loaded data.
# File sizes are divided by 1e9 to report them in GB.
print("源数据文件总大小", source_size/1e9)
print("JSON Python 对象占用空间大小为：", getsizeof(j), '文件大小为', os.path.getsize(json_path)/1e9)
print("HDF5 Python 对象占用空间大小为：", getsizeof(h), '文件大小为', os.path.getsize(save_path)/1e9)

源数据文件总大小 1.718896862
JSON Python 对象占用空间大小为： 384 文件大小为 2.820918823
HDF5 Python 对象占用空间大小为： 80 文件大小为 2.775279132


In [8]:
# Display the root group of the HDF5 file and its child groups.
h.root

/ (RootGroup) "Xinet's dataset"
  children := ['HWDB10trn' (Group), 'HWDB10tst' (Group), 'HWDB11trn' (Group), 'HWDB11tst' (Group), 'OLHWDB10trn' (Group), 'OLHWDB10tst' (Group), 'OLHWDB11trn' (Group), 'OLHWDB11tst' (Group)]

In [9]:
# Top-level keys of the JSON-loaded object, for comparison with the HDF5 groups.
j.keys()

dict_keys(['HWDB10trn', 'HWDB10tst', 'HWDB11trn', 'HWDB11tst', 'OLHWDB10trn', 'OLHWDB10tst', 'OLHWDB11trn', 'OLHWDB11tst'])

In [10]:
# Inspect the group of a single writer inside the HDF5 file.
h.root.HWDB10trn.writer001

/HWDB10trn/writer001 (Group) ''
  children := ['features' (Array), 'labels' (Array), 'text' (Array)]

In [11]:
# Keys available for the same writer in the JSON-loaded object.
j.HWDB10trn.writer001.keys()

dict_keys(['text', 'features'])

In [12]:
# Descriptive text stored for writer007 in the JSON-loaded data.
j.HWDB10trn.writer007.text

'Character features extracted from grayscale images. #ftrtype=ncg, #norm=ldi, #aspect=4, #dirn=8, #zone=8, #zstep=8, #fstep=8, $deslant=0, $smooth=0, $nmdir=0, $multisc=0'

In [13]:
# First rows of writer007's feature table in the JSON-loaded data.
j.HWDB10trn.writer007.features.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
邑,21,8,6,1,0,5,5,0,28,12,...,0,0,31,41,18,11,5,0,0,0
屹,2,25,9,0,7,6,1,0,26,43,...,0,1,13,2,14,46,16,1,0,0
亿,4,13,11,1,0,0,0,0,3,3,...,0,1,11,0,4,43,29,7,1,0
役,11,7,3,19,6,4,10,0,13,17,...,3,0,11,6,7,1,13,51,46,5
臆,8,2,6,3,9,5,4,0,11,3,...,1,7,6,31,15,0,14,12,6,5


In [14]:
# Same descriptive text read from the HDF5 file; the value returned by read()
# is decoded to obtain a str.
h.root.HWDB10trn.writer007.text.read().decode()

'Character features extracted from grayscale images. #ftrtype=ncg, #norm=ldi, #aspect=4, #dirn=8, #zone=8, #zstep=8, #fstep=8, $deslant=0, $smooth=0, $nmdir=0, $multisc=0'

In [15]:
# Read writer007's labels from the HDF5 file: decode the stored value and
# split it on single spaces to get a list of labels.
b = h.root.HWDB10trn.writer007.labels.read().decode().split(' ')
# Preview the first five labels.
b[:5]

['邑', '屹', '亿', '役', '臆']

In [16]:
# Feature node of the same writer in the HDF5 file.
c = h.root.HWDB10trn.writer007.features
# Slice out the first five rows for display.
c[:5]

array([[21,  8,  6, ...,  0,  0,  0],
       [ 2, 25,  9, ...,  1,  0,  0],
       [ 4, 13, 11, ...,  7,  1,  0],
       [11,  7,  3, ..., 51, 46,  5],
       [ 8,  2,  6, ..., 12,  6,  5]], dtype=uint8)

In [17]:
# Label node of writer001 in the OLHWDB10trn group of the HDF5 file.
a = h.root.OLHWDB10trn.writer001.labels

In [18]:
# Decode the stored labels and split on single spaces; there is no slicing
# here, so the whole label list is displayed.
a.read().decode().split(' ')

['扼',
 '遏',
 '鄂',
 '饿',
 '恩',
 '而',
 '儿',
 '耳',
 '尔',
 '饵',
 '洱',
 '二',
 '贰',
 '发',
 '罚',
 '筏',
 '伐',
 '乏',
 '阀',
 '法',
 '藩',
 '帆',
 '番',
 '翻',
 '樊',
 '矾',
 '钒',
 '繁',
 '凡',
 '烦',
 '反',
 '返',
 '范',
 '贩',
 '犯',
 '饭',
 '泛',
 '坊',
 '芳',
 '方',
 '肪',
 '房',
 '防',
 '妨',
 '仿',
 '访',
 '纺',
 '放',
 '菲',
 '非',
 '啡',
 '飞',
 '肥',
 '匪',
 '诽',
 '吠',
 '肺',
 '废',
 '沸',
 '费',
 '芬',
 '酚',
 '吩',
 '氛',
 '分',
 '纷',
 '坟',
 '焚',
 '汾',
 '粉',
 '奋',
 '份',
 '忿',
 '愤',
 '粪',
 '丰',
 '封',
 '枫',
 '蜂',
 '峰',
 '锋',
 '风',
 '疯',
 '烽',
 '逢',
 '冯',
 '缝',
 '讽',
 '奉',
 '凤',
 '佛',
 '否',
 '夫',
 '敷',
 '肤',
 '孵',
 '扶',
 '拂',
 '辐',
 '幅',
 '氟',
 '符',
 '伏',
 '俘',
 '服',
 '浮',
 '涪',
 '福',
 '袱',
 '弗',
 '甫',
 '抚',
 '辅',
 '俯',
 '釜',
 '斧',
 '脯',
 '腑',
 '府',
 '腐',
 '赴',
 '副',
 '覆',
 '赋',
 '复',
 '傅',
 '付',
 '阜',
 '父',
 '腹',
 '负',
 '富',
 '讣',
 '附',
 '妇',
 '缚',
 '咐',
 '噶',
 '嘎',
 '该',
 '改',
 '概',
 '钙',
 '盖',
 '溉',
 '干',
 '甘',
 '杆',
 '柑',
 '竿',
 '肝',
 '赶',
 '感',
 '秆',
 '敢',
 '赣',
 '冈',
 '刚',
 '钢',
 '缸',
 '肛',
 '纲',
 '岗',
 '港',
 '杠',
 '篙',
 '皋'

In [19]:
# Count the labels of every writer group under OLHWDB10trn in the HDF5 file.
lsize = sum(
    len(writer.labels.read().decode().split(' '))
    for writer in h.root.OLHWDB10trn
)
# 1246991 is presumably the expected sample count for this subset -- TODO
# confirm its source.
lsize - 1246991  # why is the difference not zero?

9018

In [20]:
# Repeat the count on the JSON-loaded data: add up the index length of every
# writer's feature table under OLHWDB10trn.
lsize = sum(
    len(j.OLHWDB10trn[writer_key].features.index)
    for writer_key in j.OLHWDB10trn
)
# Difference from the same reference count (1246991) used for the HDF5 check.
lsize - 1246991

9018