In [71]:
import h5py
import os

def rename_h5_key(h5_path: str, old_key: str, new_key: str) -> None:
    """
    Rename a key in an HDF5 file and save the changes.
    
    Parameters:
        h5_path (str): Path to the HDF5 file.
        old_key (str): The existing key to rename.
        new_key (str): The new key name.
    """
    with h5py.File(h5_path, 'r+') as f:
        # Check if the old key exists
        if old_key in f:
            # Copy the data to the new key
            f[new_key] = f[old_key][:]
            
            # Copy attributes if any
            for attr_name, attr_value in f[old_key].attrs.items():
                f[new_key].attrs[attr_name] = attr_value
            
            # Delete the old key
            del f[old_key]
            print(f"Renamed '{old_key}' to '{new_key}' in file: {h5_path}")
        else:
            print(f"Key '{old_key}' not found in file: {h5_path}")
data_dir = '/ruiyan/yuhao/tile_embed/BCNB/TITAN'

# fets -> features
for file in os.listdir(data_dir):
    if file.endswith('.h5'):
        file_path = os.path.join(data_dir, file)
        rename_h5_key(file_path, old_key='feats', new_key='features')

Key 'feats' not found in file: /ruiyan/yuhao/tile_embed/BCNB/TITAN/bcnb_894.h5
Key 'feats' not found in file: /ruiyan/yuhao/tile_embed/BCNB/TITAN/bcnb_940.h5
Key 'feats' not found in file: /ruiyan/yuhao/tile_embed/BCNB/TITAN/bcnb_476.h5
Key 'feats' not found in file: /ruiyan/yuhao/tile_embed/BCNB/TITAN/bcnb_164.h5
Key 'feats' not found in file: /ruiyan/yuhao/tile_embed/BCNB/TITAN/bcnb_697.h5
Key 'feats' not found in file: /ruiyan/yuhao/tile_embed/BCNB/TITAN/bcnb_377.h5
Key 'feats' not found in file: /ruiyan/yuhao/tile_embed/BCNB/TITAN/bcnb_148.h5
Key 'feats' not found in file: /ruiyan/yuhao/tile_embed/BCNB/TITAN/bcnb_973.h5
Key 'feats' not found in file: /ruiyan/yuhao/tile_embed/BCNB/TITAN/bcnb_153.h5
Key 'feats' not found in file: /ruiyan/yuhao/tile_embed/BCNB/TITAN/bcnb_351.h5
Key 'feats' not found in file: /ruiyan/yuhao/tile_embed/BCNB/TITAN/bcnb_854.h5
Key 'feats' not found in file: /ruiyan/yuhao/tile_embed/BCNB/TITAN/bcnb_265.h5
Key 'feats' not found in file: /ruiyan/yuhao/tile_em

In [7]:
import pandas as pd
data = pd.read_csv('/ruiyan/yuhao/project/FMBC/finetune/dataset_csv/subtype/CPTAC_IDC.csv')
#map label columnt 0,1 to IDC and non-IDC
data['label'] = data['label'].map({1: 'IDC', 0: 'non-IDC'})
data.to_csv('/ruiyan/yuhao/project/FMBC/finetune/dataset_csv/subtype/CPTAC_IDC.csv', index=False)

In [22]:
#/ruiyan/yuhao/embedding/IMPRESS
import pandas as pd
import pandas as pd
df=pd.read_csv('/ruiyan/yuhao/project/FMBC/finetune/dataset_csv/sample/IMPRESS_TNBC_2subtype.csv')
# Assume `df` is your original dataframe
# Create a copy of the dataframe for '_he' and '_ihc'
df_he = df.copy()
df_ihc = df.copy()
#zfill slide_id column to 3 digits
df_he['slide_id'] =df['slide_id'].astype(str)+'_HE'
df_ihc['slide_id'] =df['slide_id'].astype(str)+'_IHC'



# Concatenate the two dataframes vertically
df_combined = pd.concat([df_he, df_ihc], ignore_index=True)

df_combined.to_csv('/ruiyan/yuhao/project/FMBC/finetune/dataset_csv/biomarker/IMPRESS_TNBC_2subtype.csv', index=False)

In [54]:
import os
import glob
data_dir= '/ruiyan/yuhao/tile_embed/TCGA-BRCA'
h5_files = glob.glob(os.path.join(data_dir, '**/*.h5'))

In [78]:
import os
from pathlib import Path
import glob
data_dir = '/ruiyan/yuhao/data'
num_patch=0
num_slides=0
for dir in os.listdir(data_dir):
    target_dir = os.path.join(data_dir, dir,'output')
        #find how many png file in the target_dir, glob
    png_files = glob.glob(os.path.join(target_dir, '**/*.png'))
    slides= len(os.listdir(os.path.join(data_dir, dir,'output')))
    num_slides+=slides
    #print(len(png_files))
    print(slides)
    print(len(png_files))
    num_patch+=len(png_files)

2788
8242984
2788
8138453
2696
8382726
2596
7728870


KeyboardInterrupt: 

In [2]:
data_dir='/ruiyan/yuhao/tile_embed/embedding/IMPRESS'
import os
for dir in os.listdir(data_dir):
    #Gigapath_tile -> Gigapath_Tile
    #CHIEF_tile -> CHIEF_Tile
    print(dir)
    if dir in ['Gigapath_tile', 'CHIEF_tile']:
        new_dir = dir.replace('_tile', '_Tile')
        os.rename(os.path.join(data_dir, dir), os.path.join(data_dir, new_dir))

Gigapath_tile
TITAN
CONCH
Gigapath
Virchow
UNI
CHIEF_tile


In [22]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from lifelines.utils import concordance_index
#CUDA error: device-side assert triggered
import os
CUDA_LAUNCH_BLOCKING=1
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# Loss Function (provided by you)
class NLLSurvLoss(nn.Module):
    def __init__(self, alpha=0.0, eps=1e-7, reduction='mean'):
        super().__init__()
        self.alpha = alpha
        self.eps = eps
        self.reduction = reduction

    def __call__(self, x, y_bins, y_event):
        y_event = y_event.type(torch.int64)
        y_bins = y_bins.type(torch.int64)
        y_censor = 1 - y_event
        hazards = torch.sigmoid(x)
        S = torch.cumprod(1 - hazards, dim=1)
        S_padded = torch.cat([torch.ones_like(y_censor), S], 1)
        s_prev = torch.gather(S_padded, dim=1, index=y_bins).clamp(min=self.eps)
        h_this = torch.gather(hazards, dim=1, index=y_bins).clamp(min=self.eps)
        s_this = torch.gather(S_padded, dim=1, index=y_bins + 1).clamp(min=self.eps)
        uncensored_loss = -(1 - y_censor) * (torch.log(s_prev) + torch.log(h_this))
        censored_loss = -y_censor * torch.log(s_this)
        loss = uncensored_loss + self.alpha * censored_loss
        if self.reduction == 'mean':
            return loss.mean()
        elif self.reduction == 'sum':
            return loss.sum()
        else:
            raise ValueError(f"Invalid reduction type: {self.reduction}")

# Dataset Class
class SurvivalDataset(Dataset):
    def __init__(self, data_file, time_bins):
        # Load and preprocess data
        df = pd.read_csv(data_file)
        df = df.dropna(subset=['OS_MONTHS'])  # Remove rows with missing OS_MONTHS
        self.features = torch.tensor(df['Sex'].values, dtype=torch.float32).unsqueeze(1)
        self.features = torch.tensor(df['Sex'].values, dtype=torch.float32).unsqueeze(1)  # Sex as feature
        self.time_bins = time_bins
        self.event = torch.tensor(df['OS_STATUS'].apply(lambda x: 1 if 'DECEASED' in x else 0).values, 
                                dtype=torch.float32).unsqueeze(1)
        self.time = df['OS_MONTHS'].values
        
        # Discretize time into bins
        self.binned_time = np.digitize(self.time, bins=time_bins[:-1])
        self.binned_time = np.clip(self.binned_time, 0, len(time_bins) - 1)
        self.binned_time = torch.tensor(self.binned_time, dtype=torch.int64).unsqueeze(1)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return {
            'features': self.features[idx],
            'event': self.event[idx],
            'time_bin': self.binned_time[idx]
        }

# Model Class
class SurvivalModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)  # Output dim = number of time bins
        )

    def forward(self, x):
        return self.network(x)

# C-index calculation
def calculate_cindex(model, dataloader, device):
    model.eval()
    all_preds = []
    all_times = []
    all_events = []
    
    with torch.no_grad():
        for batch in dataloader:
            features = batch['features'].to(device)
            events = batch['event'].numpy()
            times = batch['time_bin'].numpy()
            logits = model(features)
            hazards = torch.sigmoid(logits)
            survival = torch.cumprod(1 - hazards, dim=1)
            risk_scores = -survival.sum(dim=1).cpu().numpy()  # Negative sum of survival as risk score
            
            all_preds.extend(risk_scores)
            all_times.extend(times.flatten())
            all_events.extend(events.flatten())
    
    cindex = concordance_index(all_times, all_preds, all_events)
    return cindex

# Main training loop
def train_survival_model():
    # Hyperparameters
    input_dim = 1  # Sex only
    hidden_dim = 32
    num_bins = 10  # Number of time bins
    batch_size = 32
    num_epochs = 50
    learning_rate = 0.001
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Create time bins (example: equal intervals up to max time)
    max_time = 150  # Adjust based on your data
    time_bins = np.linspace(0, max_time, num_bins + 1)

    # Prepare data
    data_file = 'TCGA-BRCA-KM.csv'  # Save your data as CSV first
    
    dataset = SurvivalDataset(data_file, time_bins)
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model and optimizer
    model = SurvivalModel(input_dim, hidden_dim, num_bins).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = NLLSurvLoss()

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            features = batch['features'].to(device)
            time_bins = batch['time_bin'].to(device)
            events = batch['event'].to(device)

            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, time_bins, events)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        # Calculate C-index
        train_cindex = calculate_cindex(model, train_loader, device)
        test_cindex = calculate_cindex(model, test_loader, device)
        
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader):.4f}, '
              f'Train C-index: {train_cindex:.4f}, Test C-index: {test_cindex:.4f}')

    # Final evaluation
    final_test_cindex = calculate_cindex(model, test_loader, device)
    print(f'Final Test C-index: {final_test_cindex:.4f}')

if __name__ == "__main__":
    train_survival_model()

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
