In [None]:
import os
import random
import h5py
import numpy as np
from sklearn.model_selection import train_test_split

# 💌 Step 1: Define the TCGA Folder Path
base_path = "/Users/jeethpawar/be228/TCGA"  # Update with your actual path

# 💌 Step 2: List All `.h5` Files (Slides)
# os.listdir() returns entries in arbitrary, filesystem-dependent order.
# Sort them so the seeded split below assigns the same files to train/test
# on every machine and every run.
h5_files = sorted(f for f in os.listdir(base_path) if f.endswith(".h5"))

# 💌 Step 3: Randomly Split Files into 80% Train, 20% Test
# The split is done over file names, so every tile of a given file ends up
# in the same set.
random.seed(42)  # Seeds Python's `random` module only; the split is seeded by random_state
train_files, test_files = train_test_split(h5_files, test_size=0.2, random_state=42)

# 💌 Step 4: Initialize Feature and Label Lists
# Each list collects one array per loaded file; they are stacked later.
X_train, y_train, X_test, y_test = [], [], [], []

# 💌 Step 5: Function to Load Features from `.h5` Files
def load_features_from_h5(file_list, dataset_type):
    """Load features from `.h5` files and append them to a dataset.

    Each file is opened from the module-level ``base_path``. The array read
    from its ``"features"`` dataset, plus one label per row of that array, is
    appended to the module-level ``X_train``/``y_train`` lists when
    ``dataset_type`` is ``"train"``, and to ``X_test``/``y_test`` for any
    other value.

    Labels are derived from the file name: 0 if it contains ``"LUAD"``,
    otherwise 1 (assumed to be LUSC; the name is not checked for ``"LUSC"``).

    Files without a ``"features"`` dataset are skipped with a warning instead
    of being dropped silently, so a mismatch between the number of files and
    the number of loaded slides is visible.

    Args:
        file_list: Names of `.h5` files located directly in ``base_path``.
        dataset_type: ``"train"`` to fill the training lists; any other value
            fills the test lists.

    Returns:
        None. Results are accumulated in the module-level lists.
    """
    import warnings  # local import so the function does not rely on a file-level one

    # Pick the destination lists once; the choice does not depend on the file.
    if dataset_type == "train":
        feature_store, label_store = X_train, y_train
    else:
        feature_store, label_store = X_test, y_test

    for file_name in file_list:
        file_path = os.path.join(base_path, file_name)

        with h5py.File(file_path, "r") as h5_file:
            if "features" not in h5_file:
                warnings.warn(
                    f"Skipping {file_path}: no 'features' dataset found",
                    stacklevel=2,
                )
                continue
            # [:] copies the dataset into memory, so it stays usable after
            # the file is closed.
            features = h5_file["features"][:]

        # Determine label based on file name (LUAD -> 0, everything else -> 1)
        label = 0 if "LUAD" in file_name else 1

        feature_store.append(features)
        # One label per row of the feature array
        label_store.append(np.full(features.shape[0], label))

# 💌 Step 6: Load Data into Train and Test Sets
# Train is loaded first, then test, each through the same loader.
for _split_name, _split_files in (("train", train_files), ("test", test_files)):
    load_features_from_h5(_split_files, dataset_type=_split_name)

# 💌 Step 7: Convert Lists to Numpy Arrays
# Per-file feature arrays are stacked row-wise; per-file label vectors are
# joined end to end so that labels line up with feature rows.
X_train, y_train = np.vstack(X_train), np.concatenate(y_train)
X_test, y_test = np.vstack(X_test), np.concatenate(y_test)

# 💌 Step 8: Print Dataset Summary
_summary_lines = (
    "✅ Slide-Based Split Completed!",
    f"Total Training Slides: {len(train_files)}, Training Samples: {X_train.shape[0]}",
    f"Total Testing Slides: {len(test_files)}, Testing Samples: {X_test.shape[0]}",
    f"Feature Dimension: {X_train.shape[1]}",
    f"Training Labels Distribution: {np.bincount(y_train)}",
    f"Testing Labels Distribution: {np.bincount(y_test)}",
)
for _summary_line in _summary_lines:
    print(_summary_line)
