In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

from src.data.dataset import EagleEmbeddingDataset

In [None]:
# HDF5 file handed to EagleEmbeddingDataset further down
# (presumably the precomputed Eagle slide features - TODO confirm).
h5_file_path = "/equilibrium/datasets/TCGA-histological-data/eagle_output/features/halfmpp/eagle/default.h5"

# Read the HEST metadata CSV (hf:// URL) and give every row that has no
# oncotree_code the placeholder label 'Healthy'. The chain builds the final
# frame in one expression instead of patching the column afterwards.
metadata = pd.read_csv("hf://datasets/MahmoodLab/hest/HEST_v1_1_0.csv").assign(
    oncotree_code=lambda frame: frame['oncotree_code'].fillna('Healthy')
)

In [None]:
# Show the loaded metadata table as this cell's output for a quick visual check.
metadata

In [None]:
# Map every oncotree code to an integer class id.
# The codes are sorted first so the ids do not depend on the row order of the
# CSV and agree with the `label_to_id` mapping built in the split cell below;
# previously the two mappings could assign different ids to the same label.
unique_labels = sorted(metadata['oncotree_code'].unique())
label_map = {label: idx for idx, label in enumerate(unique_labels)}

In [None]:
# Split fractions consumed by the train_test_split calls in the next cell.
test_size, val_size = 0.2, 0.2

In [None]:
# Carve the metadata into train / validation / test partitions.
#
# Target proportions, as fractions of the full table:
#   test  = test_size
#   val   = val_size of what remains once the test rows are removed
#   train = everything else
holdout_fraction = test_size + (1 - test_size) * val_size  # e.g. 0.2 + 0.8 * 0.2 = 0.36

# First split: training rows vs. a temporary hold-out pool (val + test).
# Stratified on oncotree_code so both parts keep the class proportions.
# NOTE: stratified splitting needs at least two rows per class at each stage.
train_df, temp_df = train_test_split(
    metadata,
    test_size=holdout_fraction,
    random_state=42,
    stratify=metadata['oncotree_code'],
)

# Second split: divide the hold-out pool into validation and test.
# The test share has to be expressed relative to the pool, not to the full
# table. Passing val_size here (as the code did before) left the test set
# with only val_size * holdout_fraction of the data and inflated validation.
val_df, test_df = train_test_split(
    temp_df,
    test_size=test_size / holdout_fraction,  # e.g. 0.2 / 0.36 ~= 0.556
    random_state=42,
    stratify=temp_df['oncotree_code'],
)

# Sanity check: the three partitions together account for every metadata row.
assert len(train_df) + len(val_df) + len(test_df) == len(metadata)

# Build the label-to-id map (needed by every dataset instance)
# Collect the distinct oncotree codes and order them, so the integer id given
# to each code is independent of the row order in the metadata table.
unique_labels = list(metadata['oncotree_code'].unique())
unique_labels.sort()
label_to_id = dict(zip(unique_labels, range(len(unique_labels))))

# Create one dataset instance per split
# All three datasets read from the same HDF5 file and share the label mapping;
# only the metadata rows differ. Built in train, val, test order.
train_dataset, val_dataset, test_dataset = [
    EagleEmbeddingDataset(h5_file_path, split_df, label_to_id)
    for split_df in (train_df, val_df, test_df)
]

In [None]:
# plot label distribution
import matplotlib.pyplot as plt
import seaborn as sns

def plot_label_distribution(df, title):
    """Draw a horizontal count plot of the oncotree codes found in ``df``.

    Args:
        df: DataFrame containing an 'oncotree_code' column.
        title: Text used as the figure title.

    The bars follow the order returned by ``value_counts()`` on the column
    (most frequent code first by pandas' default). The figure is displayed
    with ``plt.show()``; nothing is returned.
    """
    _, ax = plt.subplots(figsize=(12, 6))
    code_order = df['oncotree_code'].value_counts().index
    sns.countplot(y='oncotree_code', data=df, order=code_order, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Count')
    ax.set_ylabel('Oncotree Code')
    plt.show()

# Plot the training split, which is what the title announces. The full
# metadata table was passed before, so the figure did not match its caption.
plot_label_distribution(train_df, 'Training Set Label Distribution')

In [None]:
# Show the metadata table again as this cell's output.
# NOTE(review): none of the visible cells modify `metadata` after the loading
# cell, so this appears to repeat the earlier preview - confirm it is needed.
metadata