In [77]:

import h5py
import numpy as np
import os


# Single example file, opened only to sanity-check the HDF5 layout before bulk loading.
file_path = "/Users/jeethpawar/be228/TCGA/LUAD/TCGA-34-2605/TCGA-34-2605-01Z-00-DX1.h5"

if os.path.exists(file_path):
    with h5py.File(file_path, "r") as h5_file:
        # Names of the top-level entries stored in the file.
        datasets = list(h5_file.keys())
        print(f"Datasets inside the file: {datasets}")

        # Extract & check the features dataset.
        if "features" in datasets:
            features = h5_file["features"][:]  # Read the whole dataset into memory
            print(f"\nShape of 'features': {features.shape}")  # Expected (num_tiles, 1024)
            print(f"Sample Features (first 5 rows):\n{features[:5]}")  # Print first 5 feature vectors

        # Coordinates are read only so their shape can be reported.
        if "coords" in datasets:
            coords = h5_file["coords"][:]
            print(f"\nShape of 'coords': {coords.shape}")  # Expected (num_tiles, 2)
else:
    # Without this branch a wrong path made the cell finish with no output at all,
    # which is indistinguishable from a file that simply has no datasets to report.
    print(f"File not found: {file_path}")
         


✅ File found! Proceeding to open the HDF5 file...
Datasets inside the file: ['coords', 'features']

Shape of 'features': (11118, 1024)
Sample Features (first 5 rows):
[[ 1.4743148  -0.66672665 -1.974797   ... -0.3268076   2.287257
   0.6425188 ]
 [ 0.49487564 -0.9996067  -1.453786   ... -0.70884335  0.656675
   0.96511996]
 [-1.0826533   0.16203742 -2.3214774  ...  0.04512507 -0.06246603
   0.34042126]
 [-2.598149   -1.0536268  -1.3663765  ...  0.65569067 -0.8721822
  -1.6494672 ]
 [-0.732723   -1.2198707  -2.5369122  ...  1.1126738  -0.04896246
  -0.5632508 ]]

Shape of 'coords': (11118, 2)
Ignoring 'coords' as instructed.


In [82]:

import h5py
import numpy as np
import os

# Define paths to the LUAD and LUSC folders.
# The root can be overridden with the TCGA_DATA_DIR environment variable so the
# notebook runs on other machines; the default is the original local location.
base_path = os.environ.get("TCGA_DATA_DIR", "/Users/jeethpawar/be228/TCGA")
luad_path = os.path.join(base_path, "LUAD")  # Folder containing LUAD .h5 files
lusc_path = os.path.join(base_path, "LUSC")  # Folder containing LUSC .h5 files

# Accumulators filled by load_features_from_folder (one entry per .h5 file).
# Re-created here so re-running this cell always starts from empty lists.
X = []  # Feature vectors
y = []  # Labels (0 = LUAD, 1 = LUSC)

# Helper: load the "features" dataset from every .h5 file under a folder
def load_features_from_folder(folder_path, label, features_out=None, labels_out=None):
    """Load features from all .h5 files under a folder and assign them a label.

    Walks ``folder_path`` recursively. For every file whose name ends in
    ``.h5`` and that contains a ``"features"`` dataset, the full feature
    matrix is appended to ``features_out`` and an array of ``label`` repeated
    once per feature row is appended to ``labels_out``. Files without a
    ``"features"`` dataset are skipped.

    Directories and files are visited in sorted order so that the row order of
    the assembled dataset is the same on every run and every machine
    (``os.walk`` itself makes no ordering guarantee).

    Args:
        folder_path: Directory to search.
        label: Value assigned to every row loaded from this folder.
        features_out: List receiving one feature matrix per file. Defaults to
            the module-level ``X`` list.
        labels_out: List receiving one label array per file. Defaults to the
            module-level ``y`` list.

    Returns:
        The number of files whose features were loaded.

    Raises:
        FileNotFoundError: If ``folder_path`` is not an existing directory.
            (``os.walk`` would otherwise yield nothing and the class would be
            silently missing from the dataset.)
    """
    if not os.path.isdir(folder_path):
        raise FileNotFoundError(f"Feature folder not found: {folder_path}")

    # Fall back to the notebook's global accumulators only when the caller did
    # not supply their own lists.
    if features_out is None:
        features_out = X
    if labels_out is None:
        labels_out = y

    n_loaded = 0
    for subdir, dirs, files in os.walk(folder_path):  # Walk through subdirectories
        dirs.sort()  # In-place sort: os.walk (top-down) then descends in this order
        for file in sorted(files):
            if not file.endswith(".h5"):  # Ensure it's an HDF5 file
                continue
            file_path = os.path.join(subdir, file)
            with h5py.File(file_path, "r") as h5_file:
                if "features" not in h5_file:
                    continue
                features = h5_file["features"][:]  # Copy into memory before the file closes
            features_out.append(features)  # Append feature matrix
            labels_out.append(np.full(features.shape[0], label))  # One label per row
            n_loaded += 1
    return n_loaded


# Fill the module-level X / y lists: one feature matrix and one label array per .h5 file.
load_features_from_folder(luad_path, label=0)  # LUAD -> 0
load_features_from_folder(lusc_path, label=1)  # LUSC -> 1

# np.vstack on an empty list fails with "need at least one array to concatenate",
# which says nothing about the cause; fail early and point at the data folders.
if len(X) == 0:
    raise RuntimeError(
        f"No .h5 file with a 'features' dataset was found under {base_path}"
    )

X = np.vstack(X)  # Stack all feature matrices into a single dataset
y = np.concatenate(y)  # Flatten labels into a single array

# Print dataset information.
print("Data Loaded Successfully!")
print(f"Total Samples: {X.shape[0]}")
print(f"Feature Dimension: {X.shape[1]}")
# minlength=2 keeps both class slots visible even if one class has no samples.
print(f"Labels Distribution: {np.bincount(y, minlength=2)}")  # Count LUAD (0) and LUSC (1)


Data Loaded Successfully!
Total Samples: 71798
Feature Dimension: 1024
Labels Distribution: [34126 37672]


In [80]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp312-cp312-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp312-cp312-macosx_12_0_arm64.whl (11.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hUsing cached joblib-1.4.2-py3-none-any.whl (301 kB)
Using cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.6.1 threadpoolctl-3.5.0


In [81]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, precision_recall_fscore_support


# Split BEFORE any preprocessing so that held-out rows cannot influence it.
#
# NOTE(review): this split shuffles individual rows of X. X is the row-wise stack
# of every .h5 file's feature matrix, so rows from the same file can land in both
# the training and the test set. If rows from one file are strongly correlated,
# the scores below will be optimistic; a split grouped by source file or patient
# (e.g. GroupShuffleSplit) would be needed to rule that out. That requires
# recording a per-row file identifier when the data is loaded.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, then apply the same transform to the
# test rows. Fitting on all of X (as before) leaks test-set mean/variance into
# the features the model is trained on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Kept so any later cell that reads X_scaled still works; not used for training.
X_scaled = scaler.transform(X)

log_reg = LogisticRegression(max_iter=500)
log_reg.fit(X_train, y_train)

y_pred = log_reg.predict(X_test)
y_pred_prob = log_reg.predict_proba(X_test)[:, 1]  # Probability of class 1 (LUSC)

accuracy = accuracy_score(y_test, y_pred)
auroc = roc_auc_score(y_test, y_pred_prob)
# average="binary" reports the metrics for the positive class (label 1) only.
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average="binary")

print(" Logistic Regression Model Trained Successfully!")
print(f" Accuracy: {accuracy:.4f}")
print(f" AUROC Score: {auroc:.4f}")
print(f" Precision: {precision:.4f}")
print(f" Recall: {recall:.4f}")
print(f" F1 Score: {f1:.4f}")


✅ Logistic Regression Model Trained Successfully!
🔹 Accuracy: 0.9997
🔹 AUROC Score: 1.0000
🔹 Precision: 0.9999
🔹 Recall: 0.9995
🔹 F1 Score: 0.9997
