In [1]:
import h5py
import numpy as np

In [2]:
def byte2str(data):
    """Decode each element of ``data`` from UTF-8 and return them as a list.

    Args:
        data: Iterable whose items provide ``.decode('utf-8')`` (e.g. ``bytes``).

    Returns:
        A new list holding the decoded value of every item, in input order.
    """
    return [raw.decode('utf-8') for raw in data]

In [3]:
def load_dataset(file_path, train_ratio=0.8, random_seed=1):
    """Load an HDF5 dataset, shuffle it, and split it into train/test parts.

    Every top-level entry of the file is read into memory as a NumPy array.
    The examples are then shuffled; the first
    ``int(num_examples * train_ratio)`` shuffled examples form the training
    split and the remaining ones form the test split.

    Args:
        file_path: Path of the HDF5 file to read. The file must provide the
            entries ``'images'``, ``'texts'`` and ``'labels'``; texts and
            labels are decoded from UTF-8 with ``byte2str``.
        train_ratio: Fraction of the examples assigned to the training split.
            Must lie in ``[0, 1]``.
        random_seed: Seed for the shuffle. When not ``None`` it is passed to
            ``np.random.seed``, which reseeds NumPy's global generator; when
            ``None`` the shuffle uses the global generator's current state.

    Returns:
        An 8-tuple ``(train_images, train_texts, train_labels,
        train_label_ids, test_images, test_texts, test_labels,
        test_label_ids)``. Images are NumPy arrays; texts and labels are lists
        of ``str``; label ids are lists of ``int`` (negative=0, neutral=1,
        positive=2).

    Raises:
        ValueError: If ``train_ratio`` is outside ``[0, 1]``.
        KeyError: If a required entry is missing from the file, or a label is
            not one of ``'negative'``, ``'neutral'``, ``'positive'``.
    """
    # Reject bad ratios before touching the file: a negative ratio would turn
    # into a negative slice index and silently produce an unrelated split, and
    # a ratio above 1 would silently leave the test split empty.
    if not 0 <= train_ratio <= 1:
        raise ValueError(
            'train_ratio must be between 0 and 1 (inclusive), got %r' % (train_ratio,)
        )

    # Materialise every entry while the file is still open
    with h5py.File(file_path, 'r') as f:
        data = {key: np.array(f[key]) for key in f.keys()}

    # Shuffle the example indices (not the arrays) so all entries stay aligned
    num_examples = len(data['labels'])
    indices = np.arange(num_examples)
    if random_seed is not None:
        # NOTE: this reseeds NumPy's global RNG, which also affects later
        # np.random calls made by the caller.
        np.random.seed(random_seed)
    np.random.shuffle(indices)

    # Split the shuffled indices into training and testing sets
    split_index = int(num_examples * train_ratio)
    train_indices = indices[:split_index]
    test_indices = indices[split_index:]
    train_data = {key: data[key][train_indices] for key in data}
    test_data = {key: data[key][test_indices] for key in data}

    # Texts and labels are stored as bytes; convert them to Python strings
    for split in (train_data, test_data):
        split['texts'] = byte2str(split['texts'])
        split['labels'] = byte2str(split['labels'])

    # Map the textual sentiment labels to integer class ids
    label2id = {'negative': 0, 'neutral': 1, 'positive': 2}
    train_data['labels_ids'] = [label2id[label] for label in train_data['labels']]
    test_data['labels_ids'] = [label2id[label] for label in test_data['labels']]

    return (
        train_data['images'], train_data['texts'], train_data['labels'], train_data['labels_ids'],
        test_data['images'], test_data['texts'], test_data['labels'], test_data['labels_ids'],
    )


In [4]:
# Location of the HDF5 file passed to load_dataset
file_path = 'MVSA_Single/dataset.h5'

# Containers for the two splits; filled in directly by the unpacking below
train_data = {}
test_data = {}
(
    train_data['image'],
    train_data['text'],
    train_data['label'],
    train_data['labels_id'],
    test_data['image'],
    test_data['text'],
    test_data['label'],
    test_data['labels_id'],
) = load_dataset(file_path)