In [1]:
import h5py
import os
import torch
from tqdm import tqdm
import numpy as np
import pandas as pd
import datasets
from datasets import Dataset, Sequence, Value

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Root of the HEST data; the two folders below are siblings under it.
HEST_ROOT = '/equilibrium/datasets/TCGA-histological-data/hest_dataset'

# Folder scanned for per-sample '<sample_id>.h5' feature files.
FEATURES_FOLDER = f'{HEST_ROOT}/patches_embeddings'
# Folder holding the matching '<sample_id>_embedding.npy' target files.
SLIDE_EMBEDDINGS_FOLDER = f'{HEST_ROOT}/slide_embeddings'

In [3]:

# Build one record per sample: the arrays stored under 'embeddings' in
# <sample_id>.h5 (features) paired with the first row of
# <sample_id>_embedding.npy (target). Samples without a target file are skipped.
H5_SUFFIX = '.h5'

data = []

# Filter up front so the progress bar counts only feature files.
feature_files = [
    fname for fname in sorted(os.listdir(FEATURES_FOLDER))
    if fname.endswith(H5_SUFFIX)
]

for fname in tqdm(feature_files):
    # Strip only the trailing extension; str.replace would also remove any
    # '.h5' occurring in the middle of the file name and corrupt the id.
    sample_id = fname[:-len(H5_SUFFIX)]
    target_path = os.path.join(SLIDE_EMBEDDINGS_FOLDER, f"{sample_id}_embedding.npy")

    # Skip the sample if its target file does not exist
    if not os.path.exists(target_path):
        continue

    # Load the features
    feature_path = os.path.join(FEATURES_FOLDER, fname)
    with h5py.File(feature_path, 'r') as hf:
        features = hf['embeddings'][:].astype(np.float32)

    # Load the target
    target = np.load(target_path).astype(np.float32)

    # Check shapes before indexing so a malformed file is reported with the
    # offending sample id instead of an opaque IndexError/TypeError.
    if features.ndim < 2 or features.shape[0] == 0:
        raise ValueError(
            f"Features for sample {sample_id} must be a non-empty array with at least "
            f"2 dimensions, got shape {features.shape}."
        )
    if target.ndim < 2 or target.shape[0] == 0:
        raise ValueError(
            f"Target for sample {sample_id} must be a non-empty array with at least "
            f"2 dimensions, got shape {target.shape}."
        )

    # Each feature row must be as long as the target row it is paired with.
    if features.shape[1] != target.shape[1]:
        raise ValueError(f"Feature length {features.shape[1]} does not match target length {target.shape[1]} for sample {sample_id}.")

    data.append({
        'id': sample_id,
        'features': features,
        # Only the first row of the target array is kept.
        'targets': target[0]
    })

# === BUILD HUGGING FACE DATASET ===
dataset = Dataset.from_list(data)

  0%|          | 0/121 [00:00<?, ?it/s]

100%|██████████| 121/121 [00:00<00:00, 191.09it/s]


In [4]:
# Hold out 20% of the samples as the test split. The fixed seed makes the
# shuffle, and therefore the train/test membership, reproducible across runs.
# Note: `dataset` is rebound from the unsplit Dataset to the split result here,
# so re-run the dataset-building cell before re-running this one.
SPLIT_SEED = 42
dataset = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

In [5]:

# Write the split dataset to disk.
# NOTE(review): 'huggingfac' looks like a truncated 'huggingface' — confirm the
# directory name on disk before changing it.
OUTPUT_DIR = '/equilibrium/datasets/TCGA-histological-data/huggingfac/KN_dataset'
dataset.save_to_disk(OUTPUT_DIR)

Saving the dataset (0/1 shards):   0%|          | 0/94 [00:00<?, ? examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 94/94 [00:02<00:00, 33.53 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 24/24 [00:00<00:00, 53.32 examples/s] 


In [6]:
# Display the split summary: split names, column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'features', 'targets'],
        num_rows: 94
    })
    test: Dataset({
        features: ['id', 'features', 'targets'],
        num_rows: 24
    })
})

In [7]:
# Check which Python type 'features' comes back as when read from the dataset.
# Index the row first: this decodes a single example, whereas
# dataset['train']['features'][0] decodes the whole column to fetch one item.
type(dataset['train'][0]['features'])

list