Data preprocessing

In [None]:
import cv2
import os
import pandas as pd
import glob
import numpy as np
def generate_patch(patch_dir, image, image_name, x_coord, y_coord, names, spot_size=224):
    """Crop a square window around every spot centre and save it as a 224x224 JPEG.

    Patches are written to ``<patch_dir>/<image_name>/<name>.jpg``. A spot whose
    window does not lie completely inside the image is reported and skipped.

    Args:
        patch_dir: Root directory that receives one sub-directory per image.
        image: Image array indexed as ``image[row, column]`` (e.g. from ``cv2.imread``).
        image_name: Name of the sub-directory created for this image.
        x_coord: Spot centre columns in pixels (array-like or pandas Series).
        y_coord: Spot centre rows in pixels (array-like or pandas Series).
        names: One file stem per spot, in the same order as the coordinates.
        spot_size: Side length in pixels of the window cut around each centre.

    Raises:
        ValueError: If ``image`` is None.
    """
    if image is None:
        raise ValueError(f"No image data for '{image_name}'; was the file read correctly?")

    out_size = (224, 224)  # every saved patch is resized to this fixed size
    half_size = int(spot_size) // 2
    img_h, img_w = image.shape[:2]

    # Work on plain positional arrays: the callers pass pandas Series whose index holds
    # spot labels, and integer ``series[i]`` on such an index is a deprecated fallback.
    xs = np.asarray(x_coord)
    ys = np.asarray(y_coord)
    stems = list(names)
    patch_img_dir = os.path.join(patch_dir, image_name)

    for i in range(xs.shape[0]):
        cx, cy = int(xs[i]), int(ys[i])
        print(f"Processing patch at coord: ({cx}, {cy})")

        left, right = cx - half_size, cx + half_size
        top, bottom = cy - half_size, cy + half_size
        if left < 0 or top < 0 or right > img_w or bottom > img_h:
            print(f"Patch coordinates out of image bounds: ({cx}, {cy})")
            continue

        patch = image[top:bottom, left:right]
        if patch.size == 0:
            print(f"Empty patch at coord: ({cx}, {cy})")
            continue

        # Created lazily so that an image without any valid spot leaves no empty directory.
        os.makedirs(patch_img_dir, exist_ok=True)
        out_path = os.path.join(patch_img_dir, f"{stems[i]}.jpg")

        try:
            patch_resized = cv2.resize(patch, out_size, interpolation=cv2.INTER_CUBIC)
            # cv2.imwrite signals failure through its return value instead of raising.
            written = cv2.imwrite(out_path, patch_resized)
        except Exception as e:
            print(f"Failed to resize or save patch at coord: ({cx}, {cy}): {e}")
            continue

        if written:
            print(f"Saved patch at coord: ({cx}, {cy})")
        else:
            print(f"Failed to resize or save patch at coord: ({cx}, {cy}): cv2.imwrite returned False for {out_path}")


def _first_match(pattern, description):
    """Return the first path (in sorted order) matching ``pattern`` or raise FileNotFoundError."""
    matches = sorted(glob.glob(pattern))
    if not matches:
        raise FileNotFoundError(f"No {description} found matching '{pattern}'")
    return matches[0]


def split_spatial(root_dir, spot_size=224):
    """Cut per-spot image patches for every sample found under ``root_dir``.

    For each ``<patient>_ST_<rep>`` sample the function reads, from
    ``<root_dir>/GSE144240_RAW``, the image ``*<sample>.jpg``, the spot table
    ``*spot*<sample>.tsv`` (columns ``x``, ``y``, ``pixel_x``, ``pixel_y``) and the
    expression table ``*<sample>_stdata.tsv``. Only spots whose ``"<x>x<y>"`` label
    appears in both tables are kept; their patches are written by ``generate_patch``
    to ``<root_dir>/preprocessed_data/ST-patches/<sample>/``.

    Args:
        root_dir: Directory containing ``GSE144240_RAW``; output is created below it.
            With ``"."`` the paths are the same as the previously hard-coded ones.
        spot_size: Side length in pixels of the window cut around each spot.

    Raises:
        FileNotFoundError: If an image, spot or expression file is missing.
        ValueError: If an image file exists but cannot be decoded.
    """
    raw_dir = os.path.join(root_dir, 'GSE144240_RAW')
    patients = ['P2', 'P5', 'P9', 'P10']
    reps = ['rep1', 'rep2', 'rep3']
    sample_names = [patient + '_ST_' + rep for patient in patients for rep in reps]

    patch_dir = os.path.join(root_dir, 'preprocessed_data', 'ST-patches')
    os.makedirs(patch_dir, exist_ok=True)

    for img_name in sample_names:
        img_path = _first_match(os.path.join(raw_dir, '*' + img_name + '.jpg'), "histology image")
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread returns None instead of raising when decoding fails.
            raise ValueError(f"Could not read image '{img_path}'")
        print("Reading image: ", img_name)

        coord_path = _first_match(os.path.join(raw_dir, '*spot*' + img_name + '.tsv'), "spot coordinate file")
        coords = pd.read_csv(coord_path, sep='\t')

        exp_path = _first_match(os.path.join(raw_dir, '*' + img_name + '_stdata.tsv'), "expression file")
        gene_exp = pd.read_csv(exp_path, sep='\t', index_col=0)

        # Label every spot "<x>x<y>" so it can be matched against the expression index.
        coords.index = coords['x'].astype(str) + 'x' + coords['y'].astype(str)

        print("Gene expression indices:", len(gene_exp.index))
        print("Coords indices:", len(coords.index))

        common_indices = gene_exp.index.intersection(coords.index)
        print("Common indices:", len(common_indices))
        coords = coords.loc[common_indices]
        gene_exp = gene_exp.loc[common_indices]

        # Extract the spot centres as integer pixel coordinates.
        x_coord = coords['pixel_x'].round(0).astype(int)
        y_coord = coords['pixel_y'].round(0).astype(int)

        generate_patch(patch_dir, img, img_name, x_coord, y_coord, gene_exp.index, spot_size)
        

# Run the patch extraction for all samples, passing "." as root_dir.
split_spatial(".")

In [None]:
# Reinhard color normalization for HE-stained histological images using histomicsTK tools.
import os
import PIL
import skimage.io
import skimage.color
import histomicstk as htk

# Reference statistics keyed by reference-image path, so they are computed once per path.
_REFERENCE_STATS_CACHE = {}


def _reference_lab_stats(ref_image_file):
    """Return the ``lab_mean_std`` result for the reference image, cached per path."""
    if ref_image_file not in _REFERENCE_STATS_CACHE:
        im_reference = skimage.io.imread(ref_image_file)[:, :, :3]
        _REFERENCE_STATS_CACHE[ref_image_file] = (
            htk.preprocessing.color_conversion.lab_mean_std(im_reference)
        )
    return _REFERENCE_STATS_CACHE[ref_image_file]


def nmzd_reinhard_rescale(input_image_file, nmzd_path, barcode, ref_image_file='ref_HE.png'):
    """Apply Reinhard colour normalization to one image and save it as a JPEG.

    The statistics of the reference image are used as the normalization target and
    the result is written to ``<nmzd_path>/<barcode>.jpg``. Despite the function
    name, the image is not resized.

    Args:
        input_image_file: Path of the image to normalize; only its first three
            channels are used.
        nmzd_path: Existing directory that receives the normalized image.
        barcode: File stem of the output image.
        ref_image_file: Path of the reference image. Its statistics are cached per
            path for the lifetime of the kernel, so edits to the file on disk are
            not picked up until the kernel is restarted.
    """
    # `import PIL` alone does not guarantee that the PIL.Image submodule is loaded.
    from PIL import Image

    im_input = skimage.io.imread(input_image_file)[:, :, :3]
    mean_ref, std_ref = _reference_lab_stats(ref_image_file)

    im_nmzd = htk.preprocessing.color_normalization.reinhard(im_input, mean_ref, std_ref)
    Image.fromarray(im_nmzd).save(os.path.join(nmzd_path, barcode + ".jpg"))


# Normalize every patch of every tissue section and mirror the directory layout
# of the patch root below the normalized-patch root.
directory_path = './preprocessed_data/ST-patches'
save_root_path = "./preprocessed_data/patches_nmzd"

all_items = os.listdir(directory_path)

# Every sub-directory of the patch root is treated as one tissue section.
tissue_list = [item for item in all_items if os.path.isdir(os.path.join(directory_path, item))]
print(len(tissue_list))

for tissue_name in tissue_list:
    source_path = os.path.join(directory_path, tissue_name)
    nmzd_path = os.path.join(save_root_path, tissue_name)
    os.makedirs(nmzd_path, exist_ok=True)

    for filename in os.listdir(source_path):
        barcode, extension = os.path.splitext(filename)
        if extension != '.jpg':
            continue

        input_img_file = os.path.join(source_path, filename)
        try:
            nmzd_reinhard_rescale(input_img_file, nmzd_path, barcode)
        except Exception as e:
            # Keep going on a bad patch, but report why it failed; a bare `except:`
            # would also swallow KeyboardInterrupt and hide the cause.
            print("Error occured in %s: %s" % (input_img_file, e))

    print("End of normalization & rescaling of %s" % tissue_name)

12
End of normalization & rescaling of P10_ST_rep3
End of normalization & rescaling of P9_ST_rep3
End of normalization & rescaling of P5_ST_rep1
End of normalization & rescaling of P5_ST_rep3
End of normalization & rescaling of P9_ST_rep1
End of normalization & rescaling of P2_ST_rep3
End of normalization & rescaling of P2_ST_rep1
End of normalization & rescaling of P5_ST_rep2
End of normalization & rescaling of P2_ST_rep2
End of normalization & rescaling of P9_ST_rep2
End of normalization & rescaling of P10_ST_rep1
End of normalization & rescaling of P10_ST_rep2


In [3]:
# Report how many entries each tissue directory of the patch root contains.
directory_path = './preprocessed_data/ST-patches'

all_items = os.listdir(directory_path)

# Keep only the entries that are directories (one per tissue section).
tissue_list = list(
    filter(lambda entry: os.path.isdir(os.path.join(directory_path, entry)), all_items)
)
print(len(tissue_list))

for tissue_name in tissue_list:
    print(tissue_name)
    tissue = list(os.listdir('./preprocessed_data/ST-patches/'+ tissue_name))
    print(len(tissue))

12
P10_ST_rep3
462
P9_ST_rep3
1182
P5_ST_rep1
590
P5_ST_rep3
521
P9_ST_rep1
1145
P2_ST_rep3
638
P2_ST_rep1
666
P5_ST_rep2
521
P2_ST_rep2
646
P9_ST_rep2
1071
P10_ST_rep1
608
P10_ST_rep2
621


In [None]:
# Run HoVer-Net inference (run_infer.py, "tile" sub-command) on one directory of patch images.
# --save_qupath is requested here; a later cell of this notebook reads per-spot .tsv files
# from ./preprocessed_data/hover_seg/<sample>/qupath/.
# NOTE(review): the input/output paths below point at dataset/her2st/.../A1, whereas the other
# cells work on ./preprocessed_data/ST-patches/<sample>. Confirm the paths and that the command
# is run once per sample.
# NOTE(review): this is shell syntax and presumably meant to be run in a terminal. Confirm.
cd Hover-net/hover_net-master/hover_net-master/

python run_infer.py \
--gpu='0' \
--nr_types=6 \
--type_info_path=type_info.json \
--batch_size=64 \
--model_mode=fast \
--model_path=pretrained/hovernet_fast_pannuke_type_tf2pytorch.tar \
--nr_inference_workers=8 \
--nr_post_proc_workers=16 \
tile \
--input_dir=../../../dataset/her2st/preprocessed_data/ST-patches/A1 \
--output_dir=../../../dataset/her2st/preprocessed_data/hover_seg/A1 \
--mem_usage=0.1 \
--draw_dot \
--save_qupath



Image feature extraction

In [None]:
from PIL import Image
import torch
from transformers import AutoImageProcessor, AutoModel
import os
import numpy as np


# Load the image processor and the model from the same local checkpoint directory.
model_dir = "/d/zhoujl/my_model/model_train/phikon-v2"
processor = AutoImageProcessor.from_pretrained(model_dir)
model = AutoModel.from_pretrained(model_dir)
model.eval()


source_patches_dir = "./preprocessed_data/patches_nmzd"
feature_save_dir = "./preprocessed_data/precomputed_features"
os.makedirs(feature_save_dir, exist_ok=True)

image_extensions = ('.png', '.jpg', '.jpeg', '.tif', '.tiff')

# One sub-directory per slice; the feature files mirror that layout.
for slice_folder in os.listdir(source_patches_dir):
    slice_path = os.path.join(source_patches_dir, slice_folder)

    if not os.path.isdir(slice_path):
        continue

    slice_feature_dir = os.path.join(feature_save_dir, slice_folder)
    os.makedirs(slice_feature_dir, exist_ok=True)

    for image_file in os.listdir(slice_path):
        if not image_file.lower().endswith(image_extensions):
            continue

        image_path = os.path.join(slice_path, image_file)

        try:
            # Close the file handle promptly (thousands of patches are processed) and
            # hand the processor a 3-channel RGB image whatever the file's mode is.
            with Image.open(image_path) as image_handle:
                image = image_handle.convert("RGB")

            inputs = processor(image, return_tensors="pt")

            with torch.inference_mode():
                outputs = model(**inputs)
                # Embedding of the first token of the last hidden state, as a 1-D array.
                features = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()

            feature_file = os.path.join(slice_feature_dir, f"{os.path.splitext(image_file)[0]}.npy")
            np.save(feature_file, features)

            print(f"Feature extracted and saved from {image_path} to {feature_file}")

        except Exception as e:
            print(f"Failed to process {image_path}: {e}")

print("All features extracted and saved according to the original classification!")

  from .autonotebook import tqdm as notebook_tqdm
2025-08-28 16:48:49.687094: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-28 16:48:49.700297: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-08-28 16:48:49.714679: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-08-28 16:48:49.718805: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-08-28 16:48:49.7

Feature extracted and saved from /d/zhoujl/my_model/dataset/cscc/preprocessed_data/patches_nmzd/P10_ST_rep3/31x35.jpg to /d/zhoujl/my_model/dataset/cscc/preprocessed_data/precomputed_features/P10_ST_rep3/31x35.npy
Feature extracted and saved from /d/zhoujl/my_model/dataset/cscc/preprocessed_data/patches_nmzd/P10_ST_rep3/18x26.jpg to /d/zhoujl/my_model/dataset/cscc/preprocessed_data/precomputed_features/P10_ST_rep3/18x26.npy
Feature extracted and saved from /d/zhoujl/my_model/dataset/cscc/preprocessed_data/patches_nmzd/P10_ST_rep3/25x37.jpg to /d/zhoujl/my_model/dataset/cscc/preprocessed_data/precomputed_features/P10_ST_rep3/25x37.npy
Feature extracted and saved from /d/zhoujl/my_model/dataset/cscc/preprocessed_data/patches_nmzd/P10_ST_rep3/21x23.jpg to /d/zhoujl/my_model/dataset/cscc/preprocessed_data/precomputed_features/P10_ST_rep3/21x23.npy
Feature extracted and saved from /d/zhoujl/my_model/dataset/cscc/preprocessed_data/patches_nmzd/P10_ST_rep3/30x24.jpg to /d/zhoujl/my_model/data

Spot type selection

In [None]:
import os
import pandas as pd

directory_path = './preprocessed_data/ST-patches'

all_items = os.listdir(directory_path)

# Every sub-directory of the patch root is one sample.
sample_list = [item for item in all_items if os.path.isdir(os.path.join(directory_path, item))]
print(len(sample_list))
# Guarded so that an empty patch root is reported instead of raising IndexError.
if sample_list:
    print(sample_list[0])
else:
    print("No sample directories found in %s" % directory_path)

# Extension of the per-spot files read from the segmentation output.
file_extension = '.tsv'

# Type labels that every proportions table must contain (missing ones are filled with 0).
# NOTE(review): presumably these match the values of the `name` column in the
# segmentation .tsv files. Confirm against the actual output.
all_types = ['nolabe', 'necros', 'neopla', 'inflam', 'connec', 'no-neo']

def calculate_proportions(file_path):
    """Return the percentage of rows per ``name`` value in a tab-separated file.

    Every label listed in ``all_types`` is present in the result; labels that do
    not occur in the file get 0.0.

    Args:
        file_path: Path of a tab-separated file with a ``name`` column.

    Returns:
        A ``(proportions, total_count)`` tuple: a dict mapping each label to its
        percentage of rows, and the number of rows in the file.
    """
    table = pd.read_csv(file_path, sep='\t')
    n_rows = len(table)

    # Relative frequency of each label, expressed in percent.
    percent_by_type = (table['name'].value_counts(normalize=True) * 100).to_dict()

    # Labels that never occur are still reported, with a share of zero.
    for cell_type in all_types:
        percent_by_type.setdefault(cell_type, 0.0)

    return percent_by_type, n_rows

def save_proportions_to_tsv(proportions, total_count, output_path):
    """Write one tab-separated row per label in ``all_types`` to ``output_path``.

    Args:
        proportions: Mapping from label to percentage; missing labels are written as 0.0.
        total_count: Value repeated in the ``total_count`` column of every row.
        output_path: File to create or overwrite.
    """
    with open(output_path, 'w') as handle:
        handle.write('name\tproportion\ttotal_count\n')
        handle.writelines(
            f'{cell_type}\t{proportions.get(cell_type, 0.0)}\t{total_count}\n'
            for cell_type in all_types
        )

def summarize_spot_types(output_dir, summary_output_path):
    """Write a table assigning each spot the label with the highest proportion.

    Every ``*_proportions.tsv`` file in ``output_dir`` contributes one row
    (``spot``, ``type``) to the tab-separated summary. Files are processed in
    sorted name order, ties go to the label listed first in the file, and files
    without any usable proportion row are reported and skipped.

    Args:
        output_dir: Directory holding the per-spot ``*_proportions.tsv`` files.
        summary_output_path: Path of the summary file to create or overwrite.
    """
    suffix = '_proportions.tsv'
    spot_types = []

    # Sorted so the row order does not depend on the file-system listing order.
    for filename in sorted(os.listdir(output_dir)):
        if not filename.endswith(suffix):
            continue

        df = pd.read_csv(os.path.join(output_dir, filename), sep='\t')
        valid = df.dropna(subset=['proportion'])
        if valid.empty:
            print(f"Skipping {filename}: no proportion rows")
            continue

        # idxmax returns the first row holding the maximum, i.e. the earliest label on ties.
        dominant_type = valid.loc[valid['proportion'].idxmax(), 'name']
        spot_name = filename[:-len(suffix)]
        spot_types.append({'spot': spot_name, 'type': dominant_type})

    # Explicit columns keep the header even when no spot was summarized.
    summary_df = pd.DataFrame(spot_types, columns=['spot', 'type'])
    summary_df.to_csv(summary_output_path, sep='\t', index=False)

# For every sample: turn each per-spot segmentation table into a proportions table,
# then summarize the dominant label of every spot.
for sample in sample_list:
    input_dir = './preprocessed_data/hover_seg/'+sample+'/qupath/'
    output_dir = './preprocessed_data/spots_type/'+sample+'/'

    # A sample without segmentation output is reported and skipped instead of
    # aborting the remaining samples with FileNotFoundError.
    if not os.path.isdir(input_dir):
        print(f"Skipping {sample}: no segmentation output found at {input_dir}")
        continue

    os.makedirs(output_dir, exist_ok=True)

    # Sorted so the processing order does not depend on the file-system listing order.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith(file_extension):
            continue

        file_path = os.path.join(input_dir, filename)
        proportions, total_count = calculate_proportions(file_path)

        output_filename = os.path.splitext(filename)[0] + '_proportions.tsv'
        output_path = os.path.join(output_dir, output_filename)

        save_proportions_to_tsv(proportions, total_count, output_path)
        print(f"Processed {filename} and saved results to {output_path}")

    summary_output_path = os.path.join(output_dir, 'summary_spot_types.tsv')
    summarize_spot_types(output_dir, summary_output_path)
    print(f"Summary of spot types saved to {summary_output_path}")

12
P10_ST_rep3
Processed 41x43.tsv and saved results to /d/zhoujl/my_model/dataset/cscc/preprocessed_data/spots_type/P10_ST_rep3/41x43_proportions.tsv
Processed 22x26.tsv and saved results to /d/zhoujl/my_model/dataset/cscc/preprocessed_data/spots_type/P10_ST_rep3/22x26_proportions.tsv
Processed 24x34.tsv and saved results to /d/zhoujl/my_model/dataset/cscc/preprocessed_data/spots_type/P10_ST_rep3/24x34_proportions.tsv
Processed 16x18.tsv and saved results to /d/zhoujl/my_model/dataset/cscc/preprocessed_data/spots_type/P10_ST_rep3/16x18_proportions.tsv
Processed 35x45.tsv and saved results to /d/zhoujl/my_model/dataset/cscc/preprocessed_data/spots_type/P10_ST_rep3/35x45_proportions.tsv
Processed 51x31.tsv and saved results to /d/zhoujl/my_model/dataset/cscc/preprocessed_data/spots_type/P10_ST_rep3/51x31_proportions.tsv
Processed 41x21.tsv and saved results to /d/zhoujl/my_model/dataset/cscc/preprocessed_data/spots_type/P10_ST_rep3/41x21_proportions.tsv
Processed 47x31.tsv and saved res