In [1]:
import h5py
import os
import numpy as np
from PIL import Image
import chardet
import codecs

# Dataset location and preprocessing settings.
data_path = 'MVSA_Single/data'
img_size = (224, 224)  # consumed later as (height, width) of the stored images
max_len = 30  # NOTE(review): not referenced by the cells visible here - confirm it is still needed

# Read the label file: one "<sample index><TAB><label>" record per line.
labels = {}
# Explicit encoding so the result does not depend on the platform's default codec.
with open('MVSA_Single/labelFirst.txt', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line (e.g. trailing newline padding) carries no record;
            # skip it instead of failing the two-value unpack below.
            continue
        # Malformed lines (not exactly two tab-separated fields) still raise
        # ValueError on purpose, so bad label files are noticed early.
        index, label = line.split('\t')
        labels[index] = label

In [2]:
# Split the label mapping into two parallel lists that share the dict's
# insertion order: index_list[i] is the sample id whose label is label_list[i].
index_list = [sample_id for sample_id in labels]
label_list = [labels[sample_id] for sample_id in index_list]

In [3]:
# Build the HDF5 file holding one image, one raw text and one label per sample.
def _resize_center_crop(img, size):
    """Scale ``img`` until it covers ``size``, then center-crop it to exactly ``size``.

    Args:
        img: PIL image to transform.
        size: Target ``(height, width)`` in pixels.

    Returns:
        A new PIL image that is ``width`` x ``height`` pixels.
    """
    target_h, target_w = size
    width, height = img.size
    # Integer arithmetic on purpose: scaling with floats and truncating with
    # int() can land one pixel short of the target (e.g. 223 instead of 224)
    # for some square inputs, and crop() would then pad the gap with black.
    if width * target_h > height * target_w:
        # Source is relatively wider than the target: fit the height, then
        # trim the surplus width equally from both sides.
        new_width = (target_h * width) // height
        img = img.resize((new_width, target_h), resample=Image.LANCZOS)
        offset = (new_width - target_w) // 2
        return img.crop((offset, 0, offset + target_w, target_h))
    # Otherwise fit the width and trim the surplus height from top and bottom.
    new_height = (target_w * height) // width
    img = img.resize((target_w, new_height), resample=Image.LANCZOS)
    offset = (new_height - target_h) // 2
    return img.crop((0, offset, target_w, offset + target_h))


n_samples = len(labels)
with h5py.File('MVSA_Single/dataset.h5', 'w') as h5_file:
    # Images: fixed-size uint8 RGB arrays laid out as (sample, height, width, channel).
    img_dataset = h5_file.create_dataset(
        'images', shape=(n_samples, img_size[0], img_size[1], 3), dtype='uint8')

    # Texts: one variable-length string per sample.
    text_dataset = h5_file.create_dataset(
        'texts', shape=(n_samples,), dtype=h5py.special_dtype(vlen=str))

    # Labels: variable-length strings, written in one go from the label list.
    label_dataset = h5_file.create_dataset(
        'labels', shape=(n_samples,), dtype=h5py.special_dtype(vlen=str))
    label_dataset[:] = label_list

    # Process every image/text pair and store it at the row matching its label.
    for i, sample_id in enumerate(index_list):
        img_filename = os.path.join(data_path, '{}.jpg'.format(sample_id))
        txt_filename = os.path.join(data_path, '{}.txt'.format(sample_id))

        # Distinct handle name so the HDF5 file handle is not shadowed.
        # Explicit UTF-8 keeps decoding platform-independent; undecodable
        # bytes are dropped, as before.
        with open(txt_filename, 'r', encoding='utf-8', errors='ignore') as txt_file:
            txt = txt_file.read().strip()

        # Force 3-channel RGB: grayscale, palette, RGBA or CMYK files would
        # otherwise yield arrays that do not fit the (H, W, 3) slot.
        # convert() loads the pixel data, so the file can be closed right away.
        with Image.open(img_filename) as raw_img:
            img = raw_img.convert('RGB')

        img = _resize_center_crop(img, img_size)

        img_dataset[i] = np.asarray(img, dtype='uint8')
        text_dataset[i] = txt