In [None]:
import os
import pandas as pd
import numpy as np
from IPython.display import display
import h5py
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.cluster import KMeans
import logging
import math
import numpy as np
import torch
from torch.nn import ModuleList
from functools import partial
from typing import List, Tuple, Dict, Optional, Any
from torchvision.transforms import Compose, RandomApply
from torchvision.transforms import functional as F
from torchvision.transforms.transforms import _setup_angle, _check_sequence_input
from torch import Tensor
from collections import defaultdict, deque
from pathlib import Path
from torch import nn
from PIL import ImageFilter, ImageOps, Image, ImageDraw

import os
# 设定实验结果的目录 (请修改为你的实际路径)
result_dir = "/home/yuhaowang/project/FMBC/finetune/outputs/BRACS_Fine"

# 需要展示的评估指标
evaluation_metrics = ['val_bacc', 'val_weighted_f1', 'val_macro_auroc']

# 你希望的模型顺序（从上到下）
desired_order = [
    "UNI", "CONCH", "Virchow","Gigapath_Tile",'Gigapath',"CHIEF_Tile","TITAN","FMBC"  # 请修改为你的模型名称
]

all_results = []

# 遍历目录中的所有模型文件夹
for model_name in os.listdir(result_dir):
    model_path = os.path.join(result_dir, model_name, "summary.csv")
    
    # 检查是否存在 summary.csv 文件
    if os.path.isfile(model_path):
        df = pd.read_csv(model_path)

        # 计算均值和标准差
        summary_stats = {"Model": model_name}
        for metric in evaluation_metrics:
            if metric in df.columns:
                mean_val = np.mean(df[metric])
                std_val = np.std(df[metric], ddof=1)  # 样本标准差
                summary_stats[metric] = f"{mean_val:.3f}±{std_val:.4f}"

        # 添加到列表
        all_results.append(summary_stats)

# 转换为 DataFrame
final_result_df = pd.DataFrame(all_results)

# 按照提供的模型顺序排序
final_result_df['sort_order'] = final_result_df['Model'].apply(lambda x: desired_order.index(x) if x in desired_order else len(desired_order))
#delete the model not in desired_order
final_result_df = final_result_df[final_result_df['sort_order']!=len(desired_order)]
final_result_df = final_result_df.sort_values(by='sort_order').drop(columns=['sort_order'])
final_result_df.style.hide(axis="index")
# 在 Jupyter Notebook 中美观显示
display(final_result_df)




In [None]:
import os
data_path = '/home/yuhaowang/data/processed_data'
embedding_path ='/data1/embedding'
for dataset in os.listdir(data_path):
    print(dataset,'has',len(os.listdir(os.path.join(data_path, dataset, 'output'))),'slides')
    dataset_embedding_path = os.path.join(embedding_path, dataset)
    if not os.path.exists(dataset_embedding_path):
        print(dataset, 'has no embedding')
        print('---------------------------')

        continue
    for model in os.listdir(dataset_embedding_path):
        print(model,'has',len(os.listdir(os.path.join(dataset_embedding_path, model))),'slides processed')
    print('---------------------------')
    


In [None]:
import  h5py
def read_assets_from_h5( h5_path: str) -> tuple:
    '''Read the assets from the h5 file'''
    assets = {}
    attrs = {}
    with h5py.File(h5_path, 'r') as f:
        for key in f.keys():
            assets[key] = f[key][:]
            if f[key].attrs is not None:
                attrs[key] = dict(f[key].attrs)
    return assets, attrs

data_dir = '/data1/embedding/TCGA-BRCA'
for model in os.listdir(data_dir):
    embedding_path=os.path.join(data_dir,model)
    test_case = os.listdir(embedding_path)[0]
    test_data,_ = read_assets_from_h5(os.path.join(embedding_path,test_case))
    print(test_data.keys())
    print(model)
    print(test_data['features'].shape)
    

In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from lifelines.utils import concordance_index
#CUDA error: device-side assert triggered
import os
CUDA_LAUNCH_BLOCKING=1
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# Loss Function (provided by you)
class NLLSurvLoss(nn.Module):
    def __init__(self, alpha=0.0, eps=1e-7, reduction='mean'):
        super().__init__()
        self.alpha = alpha
        self.eps = eps
        self.reduction = reduction

    def __call__(self, x, y_bins, y_event):
        y_event = y_event.type(torch.int64)
        y_bins = y_bins.type(torch.int64)
        y_censor = 1 - y_event
        hazards = torch.sigmoid(x)
        S = torch.cumprod(1 - hazards, dim=1)
        S_padded = torch.cat([torch.ones_like(y_censor), S], 1)
        s_prev = torch.gather(S_padded, dim=1, index=y_bins).clamp(min=self.eps)
        h_this = torch.gather(hazards, dim=1, index=y_bins).clamp(min=self.eps)
        s_this = torch.gather(S_padded, dim=1, index=y_bins + 1).clamp(min=self.eps)
        uncensored_loss = -(1 - y_censor) * (torch.log(s_prev) + torch.log(h_this))
        censored_loss = -y_censor * torch.log(s_this)
        loss = uncensored_loss + self.alpha * censored_loss
        if self.reduction == 'mean':
            return loss.mean()
        elif self.reduction == 'sum':
            return loss.sum()
        else:
            raise ValueError(f"Invalid reduction type: {self.reduction}")



# Model Class
class SurvivalModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)  # Output dim = number of time bins
        )

    def forward(self, x):
        return self.network(x)

# C-index calculation
def calculate_cindex(model, dataloader, device):
    model.eval()
    all_preds = []
    all_times = []
    all_events = []
    
    with torch.no_grad():
        for batch in dataloader:
            features = batch['features'].to(device)
            events = batch['event'].numpy()
            times = batch['time_bin'].numpy()
            logits = model(features)
            hazards = torch.sigmoid(logits)
            survival = torch.cumprod(1 - hazards, dim=1)
            risk_scores = -survival.sum(dim=1).cpu().numpy()  # Negative sum of survival as risk score
            
            all_preds.extend(risk_scores)
            all_times.extend(times.flatten())
            all_events.extend(events.flatten())
    
    cindex = concordance_index(all_times, all_preds, all_events)
    return cindex

# Main training loop

import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from lifelines.utils import concordance_index
import os


# SurvivalDataset 修改版
class SurvivalDataset(Dataset):
    def __init__(self, data_file, time_bins):
        df = pd.read_csv(data_file)
        df = df.dropna(subset=['OS_MONTHS'])
        self.features = torch.tensor(df['Sex'].values, dtype=torch.float32).unsqueeze(1)
        self.time_bins = time_bins
        
        event_np = df['OS_STATUS'].apply(lambda x: 1 if 'DECEASED' in x else 0).values
        time_np = df['OS_MONTHS'].values
        
        binned_time = np.digitize(time_np, bins=time_bins[:-1]) - 1
        num_bins = len(time_bins) - 1
        binned_time = np.where(event_np == 1, 
                              np.clip(binned_time, 0, num_bins - 1), 
                              np.clip(binned_time, 0, num_bins))
        self.binned_time = torch.tensor(binned_time, dtype=torch.int64).unsqueeze(1)
        self.event = torch.tensor(event_np, dtype=torch.float32).unsqueeze(1)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return {
            'features': self.features[idx],
            'event': self.event[idx],
            'time_bin': self.binned_time[idx]
        }

# 主训练函数修改版
def train_survival_model():
    input_dim = 1
    hidden_dim = 32
    num_bins = 10
    batch_size = 32
    num_epochs = 50
    learning_rate = 0.001
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    data_file = './TCGA-BRCA-KM.csv'
    df = pd.read_csv(data_file)
    max_time = df['OS_MONTHS'].max() + 1
    time_bins = np.linspace(0, max_time, num_bins + 1)

    dataset = SurvivalDataset(data_file, time_bins)
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    model = SurvivalModel(input_dim, hidden_dim, num_bins)
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = NLLSurvLoss()

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            features = batch['features'].to(device)
            time_bins = batch['time_bin'].to(device)
            events = batch['event'].to(device)

            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, time_bins, events)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        train_cindex = calculate_cindex(model, train_loader, device)
        test_cindex = calculate_cindex(model, test_loader, device)
        
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader):.4f}, '
              f'Train C-index: {train_cindex:.4f}, Test C-index: {test_cindex:.4f}')

    final_test_cindex = calculate_cindex(model, test_loader, device)
    print(f'Final Test C-index: {final_test_cindex:.4f}')

if __name__ == "__main__":
    train_survival_model()


RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [32]:
import os
data_dir = '/home/yuhaowang/data/'
save_dir = '/data4/embedding'
def get_unprocessed_datasets(data_dir, processed_dir):
    """获取未处理的数据集"""
    all_datasets = os.listdir(data_dir)
    processed_datasets = os.listdir(processed_dir) if os.path.exists(processed_dir) else []
    
    #unprocessd_dataset= [d for d in all_datasets if len(os.listdir(os.path.join(data_dir, d,'output'))) != len(os.listdir(os.path.join(processed_dir, d, 'FMBC')))]
    unprocessed_dataset = []
    for d in all_datasets:
        if len(os.listdir(os.path.join(data_dir, d, 'output'))) - len(os.listdir(os.path.join(processed_dir, d, 'FMBC')))>10:
            unprocessed_dataset.append(d)
    return unprocessed_dataset
    

In [33]:
dataset = get_unprocessed_datasets(data_dir, save_dir)