In [None]:
import os
import pandas as pd
import numpy as np
from IPython.display import display
import h5py
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.cluster import KMeans
import logging
import math
import numpy as np
import torch
from torch.nn import ModuleList
from functools import partial
from typing import List, Tuple, Dict, Optional, Any
from torchvision.transforms import Compose, RandomApply
from torchvision.transforms import functional as F
from torchvision.transforms.transforms import _setup_angle, _check_sequence_input
from torch import Tensor
from collections import defaultdict, deque
from pathlib import Path
from torch import nn
from PIL import ImageFilter, ImageOps, Image, ImageDraw

import os
# Directory that holds one sub-folder per model; a sub-folder is only used when
# it contains a `summary.csv` (adjust this to your own path).
result_dir = "/home/yuhaowang/project/FMBC/finetune/outputs/BRACS_Fine"

# Evaluation metrics (columns of summary.csv) to report.
evaluation_metrics = ['val_bacc', 'val_weighted_f1', 'val_macro_auroc']

# Models to show, in top-to-bottom display order. Models that are not listed
# here are dropped from the table (adjust to your own model names).
desired_order = [
    "UNI", "CONCH", "Virchow","Gigapath_Tile",'Gigapath',"CHIEF_Tile","TITAN","FMBC"
]

all_results = []

# Walk every model folder and summarise its summary.csv, if there is one.
for model_name in os.listdir(result_dir):
    model_path = os.path.join(result_dir, model_name, "summary.csv")
    if not os.path.isfile(model_path):
        continue

    df = pd.read_csv(model_path)

    # Mean and sample standard deviation (ddof=1) over the rows of summary.csv.
    # A metric missing from the file is simply left out of this model's row.
    summary_stats = {"Model": model_name}
    for metric in evaluation_metrics:
        if metric in df.columns:
            mean_val = np.mean(df[metric])
            std_val = np.std(df[metric], ddof=1)
            summary_stats[metric] = f"{mean_val:.3f}±{std_val:.4f}"

    all_results.append(summary_stats)

# Passing the columns explicitly keeps the table well-formed (and the 'Model'
# column present) even when no summary.csv was found at all.
final_result_df = pd.DataFrame(all_results, columns=["Model", *evaluation_metrics])

# Keep only the requested models and order them as in `desired_order`.
order_rank = {name: desired_order.index(name) for name in desired_order}
final_result_df = final_result_df[final_result_df['Model'].isin(desired_order)]
final_result_df = (
    final_result_df
    .assign(sort_order=final_result_df['Model'].map(order_rank))
    .sort_values(by='sort_order')
    .drop(columns=['sort_order'])
)

# `Styler.hide` returns a new Styler rather than modifying the frame, so the
# styled object itself has to be displayed for the index to be hidden.
display(final_result_df.style.hide(axis="index"))




In [None]:
import os
# Root of the pre-processed datasets and root of the extracted embeddings.
data_path = '/home/yuhaowang/data/processed_data'
embedding_path ='/data1/embedding'

# For every dataset, report how many entries its `output` folder holds and how
# many entries each model folder under the embedding root holds.
for dataset in os.listdir(data_path):
    slide_count = len(os.listdir(os.path.join(data_path, dataset, 'output')))
    print(dataset, 'has', slide_count, 'slides')

    dataset_embedding_path = os.path.join(embedding_path, dataset)
    if os.path.exists(dataset_embedding_path):
        for model in os.listdir(dataset_embedding_path):
            processed_count = len(os.listdir(os.path.join(dataset_embedding_path, model)))
            print(model, 'has', processed_count, 'slides processed')
    else:
        print(dataset, 'has no embedding')

    # Separator line closes every dataset's report, embedded or not.
    print('---------------------------')
    


In [None]:
import  h5py
def read_assets_from_h5( h5_path: str) -> tuple:
    '''Read every top-level object of an HDF5 file together with its attributes.

    Args:
        h5_path: Path of the HDF5 file; it is opened read-only.

    Returns:
        A pair ``(assets, attrs)``. ``assets`` maps each top-level key to the
        full contents of that dataset, and ``attrs`` maps the same key to a
        plain ``dict`` copy of the object's HDF5 attributes.
    '''
    assets = {}
    attrs = {}
    with h5py.File(h5_path, 'r') as f:
        for key in f.keys():
            node = f[key]
            # `[()]` reads the whole dataset like `[:]` does, but it also works
            # for scalar (0-d) datasets, which `[:]` rejects.
            assets[key] = node[()]
            # `.attrs` is always an attribute manager (never None), so it can
            # be copied unconditionally.
            attrs[key] = dict(node.attrs)
    return assets, attrs

# For every model folder under the TCGA-BRCA embedding directory, open the first
# file returned by os.listdir and print its keys, the model name and the shape
# of its 'features' array.
# NOTE(review): this rebinds the notebook-level names `data_dir` and
# `embedding_path`; os.listdir order is arbitrary, so "first file" is not a
# fixed slide, and an empty model folder makes the `[0]` lookup raise IndexError.
data_dir = '/data1/embedding/TCGA-BRCA'
for model in os.listdir(data_dir):
    embedding_path=os.path.join(data_dir,model)
    test_case = os.listdir(embedding_path)[0]
    test_data,_ = read_assets_from_h5(os.path.join(embedding_path,test_case))
    print(test_data.keys())
    print(model)
    print(test_data['features'].shape)
    

In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from lifelines.utils import concordance_index
#CUDA error: device-side assert triggered
import os
# Ask CUDA to launch kernels synchronously so that an error such as
# "device-side assert triggered" is reported at the call that caused it.
# Only the environment variable has an effect; a plain Python variable of the
# same name does nothing, so it is not assigned here.
# NOTE(review): presumably this must run before the first CUDA call to take effect.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# Loss Function (provided by you)
class NLLSurvLoss(nn.Module):
    """Negative log-likelihood loss for discrete-time survival prediction.

    The input holds one logit per time bin. ``sigmoid`` turns each logit into
    the hazard of that bin, and the cumulative product of ``1 - hazard`` gives
    the survival curve.

    Args:
        alpha: Weight of the censored-sample term in
            ``loss = uncensored_loss + alpha * censored_loss``.
            NOTE(review): with the default ``alpha=0.0`` the censored term is
            multiplied by zero, so censored samples do not contribute to the
            loss at all - confirm that this is intended.
        eps: Lower clamp applied before every ``log`` to avoid ``log(0)``.
        reduction: ``'mean'`` or ``'sum'`` over all samples.
    """

    def __init__(self, alpha=0.0, eps=1e-7, reduction='mean'):
        super().__init__()
        self.alpha = alpha
        self.eps = eps
        self.reduction = reduction

    def forward(self, x, y_bins, y_event):
        """Compute the loss for one batch.

        Implemented as ``forward`` (not ``__call__``) so that the regular
        ``nn.Module`` call machinery and hooks stay active; calling the
        instance directly works exactly as before.

        Args:
            x: Logits of shape ``(batch, n_bins)``.
            y_bins: Time-bin index of every sample, shape ``(batch,)`` or
                ``(batch, 1)``, with values in ``[0, n_bins - 1]``.
            y_event: 1 where the event was observed, 0 where the sample is
                censored; same shapes as ``y_bins``.

        Returns:
            A scalar tensor, reduced according to ``self.reduction``.

        Raises:
            ValueError: If a bin index lies outside ``[0, n_bins - 1]`` or if
                ``self.reduction`` is neither ``'mean'`` nor ``'sum'``.
        """
        # Accept both (batch,) and (batch, 1) labels.
        y_event = y_event.reshape(-1, 1).type(torch.int64)
        y_bins = y_bins.reshape(-1, 1).type(torch.int64)

        # An out-of-range index would otherwise fail inside torch.gather with
        # an index error that, on CUDA, is reported as an opaque device-side
        # assert. The check costs one host/device synchronisation per call.
        n_bins = x.shape[1]
        if y_bins.numel() > 0:
            lowest, highest = int(y_bins.min()), int(y_bins.max())
            if lowest < 0 or highest >= n_bins:
                raise ValueError(
                    f"y_bins must lie in [0, {n_bins - 1}], got values in [{lowest}, {highest}]"
                )

        y_censor = 1 - y_event
        hazards = torch.sigmoid(x)
        S = torch.cumprod(1 - hazards, dim=1)
        # Prepend S(0) = 1 with the dtype and device of S so that column k of
        # S_padded is the survival probability at the start of bin k.
        S_padded = torch.cat([torch.ones_like(S[:, :1]), S], 1)
        s_prev = torch.gather(S_padded, dim=1, index=y_bins).clamp(min=self.eps)
        h_this = torch.gather(hazards, dim=1, index=y_bins).clamp(min=self.eps)
        s_this = torch.gather(S_padded, dim=1, index=y_bins + 1).clamp(min=self.eps)
        # Observed event: survived up to the bin, then failed inside it.
        uncensored_loss = -(1 - y_censor) * (torch.log(s_prev) + torch.log(h_this))
        # Censored: survived through the end of the bin.
        censored_loss = -y_censor * torch.log(s_this)
        loss = uncensored_loss + self.alpha * censored_loss
        if self.reduction == 'mean':
            return loss.mean()
        elif self.reduction == 'sum':
            return loss.sum()
        else:
            raise ValueError(f"Invalid reduction type: {self.reduction}")



# Model Class
class SurvivalModel(nn.Module):
    """Small MLP that maps a feature vector to one logit per time bin.

    Args:
        input_dim: Size of the input feature vector.
        hidden_dim: Width of the two hidden layers.
        output_dim: Number of outputs, i.e. the number of time bins.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Layers are created in forward order: Linear-ReLU-Linear-ReLU-Linear.
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw (pre-sigmoid) outputs of the network for ``x``."""
        logits = self.network(x)
        return logits

# C-index calculation
def calculate_cindex(model, dataloader, device):
    """Compute the concordance index of ``model`` over ``dataloader``.

    Args:
        model: Network returning one logit per time bin for a feature batch.
        dataloader: Iterable of dict batches with the keys ``'features'``,
            ``'event'`` and ``'time_bin'`` (CPU tensors).
        device: Device the features are moved to before the forward pass.

    Returns:
        The value returned by lifelines' ``concordance_index``.

    Note:
        The model is switched to eval mode and is not switched back.
    """
    model.eval()
    all_scores = []
    all_times = []
    all_events = []

    with torch.no_grad():
        for batch in dataloader:
            features = batch['features'].to(device)
            events = batch['event'].numpy()
            times = batch['time_bin'].numpy()
            logits = model(features)
            hazards = torch.sigmoid(logits)
            survival = torch.cumprod(1 - hazards, dim=1)
            # lifelines' concordance_index expects scores that INCREASE with
            # survival time (its docs negate hazards before passing them).
            # The summed survival curve has that orientation; passing its
            # negative (a risk score) would report 1 - C instead of C.
            survival_scores = survival.sum(dim=1).cpu().numpy()

            all_scores.extend(survival_scores)
            all_times.extend(times.flatten())
            all_events.extend(events.flatten())

    cindex = concordance_index(all_times, all_scores, all_events)
    return cindex

# Main training loop

import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from lifelines.utils import concordance_index
import os


# SurvivalDataset (revised version)
class SurvivalDataset(Dataset):
    """Survival samples read from a CSV file, with time discretised into bins.

    Rows without ``OS_MONTHS`` are dropped. The only feature is the ``Sex``
    column, the event flag is 1 when ``OS_STATUS`` contains ``'DECEASED'``,
    and ``OS_MONTHS`` is mapped to a bin index using ``time_bins``.

    Args:
        data_file: Path of the CSV file.
        time_bins: Increasing bin edges; ``len(time_bins) - 1`` bins result.
    """

    def __init__(self, data_file, time_bins):
        table = pd.read_csv(data_file).dropna(subset=['OS_MONTHS'])
        self.time_bins = time_bins

        sex_column = table['Sex'].values
        self.features = torch.tensor(sex_column, dtype=torch.float32).unsqueeze(1)

        def _is_deceased(status):
            return 1 if 'DECEASED' in status else 0

        event_np = table['OS_STATUS'].apply(_is_deceased).values
        time_np = table['OS_MONTHS'].values

        # The last edge is left out of the digitize call; subtracting 1 turns
        # the result into a zero-based bin index.
        num_bins = len(time_bins) - 1
        raw_bin = np.digitize(time_np, bins=time_bins[:-1]) - 1
        # Samples with an event are capped at the last bin index; censored
        # samples are allowed one index more (num_bins).
        upper_bound = np.where(event_np == 1, num_bins - 1, num_bins)
        binned_time = np.clip(raw_bin, 0, upper_bound)

        self.event = torch.tensor(event_np, dtype=torch.float32).unsqueeze(1)
        self.binned_time = torch.tensor(binned_time, dtype=torch.int64).unsqueeze(1)

    def __len__(self):
        """Number of samples kept after dropping rows without OS_MONTHS."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the feature, event flag and time bin of sample ``idx``."""
        sample = {}
        sample['features'] = self.features[idx]
        sample['event'] = self.event[idx]
        sample['time_bin'] = self.binned_time[idx]
        return sample

# Main training function (revised version)
def train_survival_model():
    """Train a SurvivalModel on './TCGA-BRCA-KM.csv' and print C-indices.

    The time axis is cut into 10 equal-width bins from 0 to the largest
    ``OS_MONTHS`` value plus one. The dataset is split 80/20 at random, the
    model is trained with Adam and NLLSurvLoss for 50 epochs, and the mean
    training loss plus the train/test C-index are printed after every epoch,
    followed by the final test C-index.
    """
    # Hyper-parameters.
    input_dim, hidden_dim, num_bins = 1, 32, 10
    batch_size, num_epochs, learning_rate = 32, 50, 0.001
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Equal-width bin edges spanning the observed follow-up times.
    data_file = './TCGA-BRCA-KM.csv'
    max_time = pd.read_csv(data_file)['OS_MONTHS'].max() + 1
    bin_edges = np.linspace(0, max_time, num_bins + 1)

    dataset = SurvivalDataset(data_file, bin_edges)
    n_train = int(0.8 * len(dataset))
    n_test = len(dataset) - n_train
    train_dataset, test_dataset = torch.utils.data.random_split(dataset, [n_train, n_test])

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    model = SurvivalModel(input_dim, hidden_dim, num_bins)
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = NLLSurvLoss()

    for epoch in range(num_epochs):
        # calculate_cindex leaves the model in eval mode, so re-enable
        # training mode at the start of every epoch.
        model.train()
        running_loss = 0
        for batch in train_loader:
            features = batch['features'].to(device)
            bin_targets = batch['time_bin'].to(device)
            event_flags = batch['event'].to(device)

            optimizer.zero_grad()
            loss = criterion(model(features), bin_targets, event_flags)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        train_cindex = calculate_cindex(model, train_loader, device)
        test_cindex = calculate_cindex(model, test_loader, device)

        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader):.4f}, '
              f'Train C-index: {train_cindex:.4f}, Test C-index: {test_cindex:.4f}')

    final_test_cindex = calculate_cindex(model, test_loader, device)
    print(f'Final Test C-index: {final_test_cindex:.4f}')

# Start the training run when this code is executed as the main module.
if __name__ == "__main__":
    train_survival_model()


RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [32]:
import os
# Root folder that is scanned for datasets, and root folder of the saved
# embeddings; both are passed to get_unprocessed_datasets in the next cell.
data_dir = '/home/yuhaowang/data/'
save_dir = '/data4/embedding'
def get_unprocessed_datasets(data_dir, processed_dir, model='FMBC', tolerance=10):
    """List the datasets whose embeddings are missing or clearly incomplete.

    A dataset is an entry of ``data_dir`` that contains an ``output`` folder.
    It counts as unprocessed when that folder holds more than ``tolerance``
    entries beyond what ``processed_dir/<dataset>/<model>`` holds.

    Args:
        data_dir: Folder with one sub-folder per dataset.
        processed_dir: Folder with the embeddings, laid out as
            ``<dataset>/<model>/``. It does not have to exist.
        model: Name of the model sub-folder to compare against.
        tolerance: Number of missing embeddings that is still accepted.

    Returns:
        The names of the unprocessed datasets, in ``os.listdir`` order.
    """
    unprocessed_dataset = []
    for d in os.listdir(data_dir):
        slide_dir = os.path.join(data_dir, d, 'output')
        # Entries without an `output` folder (stray files, other folders) are
        # not datasets and are skipped instead of raising.
        if not os.path.isdir(slide_dir):
            continue

        # A dataset that has no embedding folder yet has zero processed
        # slides; it must be reported, not crash the scan.
        embedding_dir = os.path.join(processed_dir, d, model)
        n_processed = len(os.listdir(embedding_dir)) if os.path.isdir(embedding_dir) else 0

        if len(os.listdir(slide_dir)) - n_processed > tolerance:
            unprocessed_dataset.append(d)
    return unprocessed_dataset
    

In [33]:
# Names of the datasets under `data_dir` that still lack embeddings in `save_dir`.
dataset = get_unprocessed_datasets(data_dir, save_dir)

In [5]:
import pandas as pd 
# Load the survival table and show it as the cell output.
df = pd.read_csv('/home/yuhaowang/project/FMBC/finetune/dataset_csv/survival/TCGA-BRCA-KM.csv')
df

Unnamed: 0,Study_ID,Patient_ID,status,time,Sex,label
0,brca_tcga,TCGA-3C-AAAU,0.0,132.95,0,1.0
1,brca_tcga,TCGA-3C-AALI,0.0,131.57,0,1.0
2,brca_tcga,TCGA-3C-AALJ,0.0,48.42,0,1.0
3,brca_tcga,TCGA-3C-AALK,0.0,47.57,0,1.0
4,brca_tcga,TCGA-4H-AAAK,0.0,11.43,0,1.0
...,...,...,...,...,...,...
1096,brca_tcga,TCGA-WT-AB44,0.0,29.01,1,1.0
1097,brca_tcga,TCGA-XX-A899,0.0,15.34,1,1.0
1098,brca_tcga,TCGA-XX-A89A,0.0,16.03,1,1.0
1099,brca_tcga,TCGA-Z7-A8R5,0.0,107.98,1,1.0


In [12]:
# Flip the 0/1 coding of the `status` column of the frame loaded earlier.
# NOTE(review): this mutates `df` in place, so running the cell a second time
# flips the column back; `df` itself is not written to disk in this cell.
df.status = 1-df.status
# Original note: time and label are fine as they are.
import pandas as pd
data = pd.read_csv('/home/yuhaowang/project/Baseline/CMTA/csv/tcga_brca_all_clean.csv')
# NOTE(review): not the last expression of the cell, so this head() is not displayed.
data.head()
# Keep only four of the columns (the source file also lists: site, is_female,
# oncotree_code, age).
temp = data[['case_id','slide_id','survival_months','censorship']]


In [14]:
# Write without the row index: with the index included, every save/load round
# trip adds another "Unnamed: 0" column to the file.
temp.to_csv('/home/yuhaowang/project/FMBC/finetune/dataset_csv/survival/TCGA-BRCA-KM.csv', index=False)


In [16]:
import pandas as pd
# Load the slide-level survival table and show it as the cell output.
data = pd.read_csv('/home/yuhaowang/project/FMBC/finetune/dataset_csv/survival/TCGA-BRCA-Survival.csv')
data

Unnamed: 0.1,Unnamed: 0,case_id,slide_id,survival_months,censorship
0,0,TCGA-3C-AALI,TCGA-3C-AALI-01Z-00-DX1.F6E9A5DF-D8FB-45CF-B4B...,131.57,1.0
1,1,TCGA-3C-AALI,TCGA-3C-AALI-01Z-00-DX2.CF4496E0-AB52-4F3E-BDF...,131.57,1.0
2,2,TCGA-3C-AALJ,TCGA-3C-AALJ-01Z-00-DX1.777C0957-255A-42F0-9EE...,48.42,1.0
3,3,TCGA-3C-AALJ,TCGA-3C-AALJ-01Z-00-DX2.62DFE56B-B84C-40F9-962...,48.42,1.0
4,4,TCGA-3C-AALK,TCGA-3C-AALK-01Z-00-DX1.4E6EB156-BB19-410F-878...,47.57,1.0
...,...,...,...,...,...
1018,1018,TCGA-WT-AB44,TCGA-WT-AB44-01Z-00-DX1.B6ECEA7C-DA26-4B34-88C...,29.01,1.0
1019,1019,TCGA-XX-A899,TCGA-XX-A899-01Z-00-DX1.08FE27B7-73B8-4CE3-ACF...,15.34,1.0
1020,1020,TCGA-XX-A89A,TCGA-XX-A89A-01Z-00-DX1.671E2AD6-4D1A-4579-88C...,16.03,1.0
1021,1021,TCGA-Z7-A8R5,TCGA-Z7-A8R5-01Z-00-DX1.3BDB407F-514C-4131-B05...,107.98,1.0


# Preprocess the survival data

In [None]:
import pandas as pd
import numpy as np

# Load the survival table.
file_path = "/home/yuhaowang/project/FMBC/finetune/dataset_csv/survival/TCGA-BRCA-Survival.csv"
df = pd.read_csv(file_path, low_memory=False)

# Fail early, naming the first required column that is absent.
required_columns = ['case_id', 'survival_months', 'censorship']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(f"Missing required column: {missing_columns[0]}")

# One row per patient: only the first row of every case_id is kept.
df = df.drop_duplicates(subset=['case_id']).reset_index(drop=True)

# Discretisation settings for the survival months.
n_bins = 4
eps = 1e-6  # Margin used to widen the outermost bin edges.

# Quantile edges are estimated on the uncensored patients only.
is_uncensored = df['censorship'] < 1
uncensored_df = df[is_uncensored]
disc_labels, q_bins = pd.qcut(uncensored_df['survival_months'], q=n_bins, retbins=True, labels=False)

# Stretch the first and last edge so that every patient, censored or not,
# falls inside some bin.
survival_months = df['survival_months']
q_bins[0] = survival_months.min() - eps
q_bins[-1] = survival_months.max() + eps

# Assign an integer bin label to every patient (left-closed intervals).
df['disc_label'] = pd.cut(
    df['survival_months'],
    bins=q_bins,
    labels=False,
    right=False,
    include_lowest=True,
).astype(int)

# Save the processed table.
# NOTE(review): this is the same path the table was read from, so the input
# file is overwritten with the de-duplicated, patient-level version.
df.to_csv("/home/yuhaowang/project/FMBC/finetune/dataset_csv/survival/TCGA-BRCA-Survival.csv", index=False)


In [9]:
#/home/yuhaowang/project/FMBC/downstream/finetune/dataset_csv/biomarker/IMPRESS_HER2_2subtype.csv

import pandas as pd 


# Read the multi-column biomarker CSV.
file_path = '/home/yuhaowang/project/FMBC/downstream/finetune/dataset_csv/biomarker/IMPRESS_HER2_2subtype.csv'
df = pd.read_csv(file_path)

# The split is only possible when the 'slide_id' column is present.
if 'slide_id' in df.columns:
    # Write one two-column CSV (slide_id + that column) per remaining column.
    for column in df.columns:
        if column == 'slide_id':
            continue
        new_df = df[['slide_id', column]]
        new_file_path = f"/home/yuhaowang/project/FMBC/downstream/finetune/dataset_csv/biomarker/IMPRESS_{column}.csv"
        new_df.to_csv(new_file_path, index=False)
        print(f"Generated: {new_file_path}")
else:
    # The message names the column exactly as it is looked up ('slide_id').
    print("Error: 'slide_id' column not found in the CSV file.")


Generated: /home/yuhaowang/project/FMBC/downstream/finetune/dataset_csv/biomarker/IMPRESS_ER.csv
Generated: /home/yuhaowang/project/FMBC/downstream/finetune/dataset_csv/biomarker/IMPRESS_PR.csv
Generated: /home/yuhaowang/project/FMBC/downstream/finetune/dataset_csv/biomarker/IMPRESS_pCR.csv
Generated: /home/yuhaowang/project/FMBC/downstream/finetune/dataset_csv/biomarker/IMPRESS_PD-L1-tumor.csv
Generated: /home/yuhaowang/project/FMBC/downstream/finetune/dataset_csv/biomarker/IMPRESS_PD-L1-stroma.csv
Generated: /home/yuhaowang/project/FMBC/downstream/finetune/dataset_csv/biomarker/IMPRESS_CD8-peritumoral.csv
Generated: /home/yuhaowang/project/FMBC/downstream/finetune/dataset_csv/biomarker/IMPRESS_CD8-intratumoral.csv
Generated: /home/yuhaowang/project/FMBC/downstream/finetune/dataset_csv/biomarker/IMPRESS_CD163-intratumoral.csv
Generated: /home/yuhaowang/project/FMBC/downstream/finetune/dataset_csv/biomarker/IMPRESS_CD163-peritumoral.csv


In [10]:
# Load the IMPRESS TNBC table; the next cell writes it back to the same path.
file_path = '/home/yuhaowang/project/FMBC/downstream/finetune/dataset_csv/biomarker/IMPRESS_TNBC_2subtype.csv'
df = pd.read_csv(file_path)

In [15]:
# Disabled step kept for reference: dropping the 'pCR (no-0, yes-1)' column.
#df.drop(columns=['pCR (no-0, yes-1)'], inplace=True)
# Write the current `df` back over the file it was loaded from, without the index.
df.to_csv('/home/yuhaowang/project/FMBC/downstream/finetune/dataset_csv/biomarker/IMPRESS_TNBC_2subtype.csv', index=False)

In [3]:
#/data4/fm_embedding/embedding/TCGA-BRCA
#glob matching all h5 file recursively, delete name contain DX1
import os
import glob
# Delete every .h5 file below data_dir whose FILE NAME contains DX1..DX4.
# WARNING: this removes files permanently.
data_dir = '/data4/fm_embedding/embedding/TCGA-BRCA'
slide_tags = ('DX1', 'DX2', 'DX3', 'DX4')
for filename in glob.iglob(os.path.join(data_dir, '**/*.h5'), recursive=True):
    # Test the base name only, so that a tag appearing in a directory name can
    # never cause every file of that directory to be deleted.
    base_name = os.path.basename(filename)
    if any(tag in base_name for tag in slide_tags):
        os.remove(filename)
        print(f"Deleted: {filename}")


Deleted: /data4/fm_embedding/embedding/TCGA-BRCA/CONCH/TCGA-D8-A3Z6-01Z-00-DX3.h5
Deleted: /data4/fm_embedding/embedding/TCGA-BRCA/CONCH/TCGA-PL-A8LZ-01A-03-DX3.h5
Deleted: /data4/fm_embedding/embedding/TCGA-BRCA/CONCH/TCGA-OK-A5Q2-01Z-00-DX3.h5
Deleted: /data4/fm_embedding/embedding/TCGA-BRCA/CONCH/TCGA-OK-A5Q2-01Z-00-DX3.5F9215C3-E407-46F8-968E-503D7D14605C.h5
Deleted: /data4/fm_embedding/embedding/TCGA-BRCA/CONCH/TCGA-PL-A8LZ-01A-03-DX3.E5D16DBF-CABD-4C96-A794-5F27BC305055.h5
Deleted: /data4/fm_embedding/embedding/TCGA-BRCA/CONCH/TCGA-OK-A5Q2-01Z-00-DX4.83B45D6C-E350-4436-812F-4155D9F7D331.h5
Deleted: /data4/fm_embedding/embedding/TCGA-BRCA/CONCH/TCGA-D8-A3Z5-01Z-00-DX3.h5
Deleted: /data4/fm_embedding/embedding/TCGA-BRCA/CONCH/TCGA-OK-A5Q2-01Z-00-DX4.h5
Deleted: /data4/fm_embedding/embedding/TCGA-BRCA/CONCH/TCGA-D8-A3Z5-01Z-00-DX3.BB83C7D4-F795-47E7-9A5C-7DBF0EB7FDAA.h5
Deleted: /data4/fm_embedding/embedding/TCGA-BRCA/UNI/TCGA-D8-A3Z6-01Z-00-DX3.h5
Deleted: /data4/fm_embedding/embed

In [27]:
# Load the DORID subtype table; `file_path` is reused when the table is saved.
file_path = '/home/yuhaowang/project/FMBC/downstream/finetune/dataset_csv/subtype/DORID_2.csv'
import pandas as pd 
data = pd.read_csv(file_path)


In [28]:
# Show the loaded table as the cell output.
data

Unnamed: 0,slide_id,IDC
0,AI-DRBR-001_2523,Invasive carcinoma of breast (disorder)
1,AI-DRBR-002_1975,Invasive carcinoma of breast (disorder)
2,AI-DRBR-003_2478,Invasive carcinoma of breast (disorder)
3,AI-DRBR-004_1358,Invasive carcinoma of breast (disorder)
4,AI-DRBR-005_1984,non-invasive carcinoma
...,...,...
283,AI-DRBR-155_2313,Invasive carcinoma of breast (disorder)
284,AI-DRBR-156_2361,Invasive carcinoma of breast (disorder)
285,AI-DRBR-158_1335,Invasive carcinoma of breast (disorder)
286,AI-DRBR-159_2229,Invasive carcinoma of breast (disorder)


In [25]:
# Keep only the first row of every slide_id.
# NOTE(review): modifies `data` in place and relies on `data` having been
# loaded by an earlier cell.
data.drop_duplicates(subset=['slide_id'], inplace=True)

In [26]:
# Write the table back to `file_path` (the file it was read from), without the index.
data.to_csv(file_path, index=False)