# 读取两个npz文件格式

In [1]:
import numpy as np
from scipy.sparse import load_npz

# 假设文件名为 file1.npz 和 file2.npz
visium_npz = '/media/cbtil/T7 Shield/NMI/data/Visium_mouse_brain/spot_ST/extract/20220329/0_1/spot_ST.npz'
visiumhd_npz = '/media/cbtil/T7 Shield/NMI/data/Visiumhdmousebrain4_8/spot_ST/extract/20240917mousebrain/0_0/spot_ST.npz'

# 加载两个 npz 文件 加载稀疏矩阵
visium = load_npz(visium_npz)
visiumhd = load_npz(visiumhd_npz)
  
visium = visium.toarray()  # 转为密集矩阵
visiumhd = visiumhd.toarray() 
print("visium ST data shape:", visium.shape)
print("visiumhd ST data shape:", visiumhd.shape)

reshaped_visium = visium.reshape(26,26,-1) # 26,26,32285
reshaped_visiumhd = visiumhd.reshape(26,26,-1) # 26,26,8049




visium ST data shape: (676, 32285)
visiumhd ST data shape: (676, 8049)


# 读取visiumhd 和 visium 的基因列表

In [2]:
import pandas as pd

# 读取 CSV 文件
visisum_list = pd.read_csv('/media/cbtil/T7 Shield/NMI/data/Visium_mouse_brain/gene_groups.csv')
# 获取第一列并转换为列表
visium_list = visisum_list.iloc[:, 0].tolist()
# 打印结果
print(len(visium_list))

visiumhd_list = []
# 打开并读取 txt 文件
with open('/media/cbtil/T7 Shield/NMI/data/Visiumhdmousebrain4_8/gene_names.txt', 'r') as file:
    visiumhd_line = file.readlines()
    lines = [line.strip() for line in visiumhd_line if line.strip() != '']
    visiumhd_list = lines
    
# 打印结果
print(len(visiumhd_list))



32285
8049


# 模式匹配 （从visiumhd(8049) -> visium(32285)）

In [5]:
# 创建一个字典，用于存储 visisum_list 中每个元素的索引
visium_index_map = {value: idx for idx, value in enumerate(visium_list)}

# 创建一个空列表来存储 visiumhd_list 中每个元素的索引（如果存在）或者 -1
result_indices = []
unmatched_items = []

# 遍历 visiumhd_list，检查每个元素是否存在于 visium_index_map 中
for item in visiumhd_list:
    index = visium_index_map.get(item, -1)  # 如果找到，返回索引，否则返回 -1
    if index == -1:
        unmatched_items.append(item)  # 记录未匹配到的项
    result_indices.append(index)

# 打印匹配不上的内容
print("未匹配的内容：")
print(unmatched_items)

# 未匹配的内容：
# ['Aldoa_1', 'Arhgef4_1', 'Pakap_1', 'Pakap_2', 'Fam220a_2']

unmatched_indices_in_visiumhd = []
for item in unmatched_items:
    # 获取未匹配项在 visiumhd_list 中的索引
    unmatched_indices_in_visiumhd.append(visiumhd_list.index(item))

# 打印未匹配内容的索引
print("未匹配内容在 visiumhd_list 中的索引：")
print(unmatched_indices_in_visiumhd)


未匹配的内容：
['Aldoa_1', 'Arhgef4_1', 'Pakap_1', 'Pakap_2', 'Fam220a_2']
未匹配内容在 visiumhd_list 中的索引：
[100, 1503, 4121, 4655, 6214]


# 修改visium的32285到前8049让他们匹配上

In [None]:
matched_visium = np.full_like(reshaped_visiumhd, 0)  # 初始化匹配矩阵，默认值是 0

# 遍历 visiumhd 中每个通道，并通过映射表进行匹配
for i in range(reshaped_visiumhd.shape[2]):
    # 获取 visiumhd 中当前通道的映射索引
    index = result_indices[i]  # 假设 result_indices[i] 是 visium 中对应通道的索引
    
    if index != -1:
        # 如果找到了匹配，将 visiumhd 中的通道值替换为 visium 中的对应通道值
        matched_visium[..., i] = reshaped_visium[..., index]
        
# 打印匹配后的 visiumhd 的形状
print("Matched visiumhd shape:", matched_visium.shape)

# 检测 matched_visium 中哪些通道没有匹配上
unmatched_channels = []

# 遍历 matched_visium 的每个通道
for i in range(matched_visium.shape[2]):
    # 如果该通道的所有元素都是 -1，说明该通道没有匹配
    if np.all(matched_visium[..., i] == -1):
        unmatched_channels.append(i)

# 输出未匹配的通道索引
print("未匹配的通道索引：")
print(unmatched_channels)


Matched visiumhd shape: (26, 26, 8049)
未匹配的通道索引：
[100, 1503, 4121, 4655, 6214]


# 遍历匹配Visium的spot st下所有batch的npz

In [7]:
import os
import numpy as np
from scipy.sparse import load_npz
from pathlib import Path

# 根目录路径
root_dir = '/media/cbtil/T7 Shield/NMI/data/Visiumhdmousebrain4_8/spot_ST/extract/20240917mousebrain'

# 新目录的根路径
output_root_dir = root_dir.replace("Visiumhdmousebrain4_8", "Visium_mouse_brain_matched")

# 创建新的根目录
Path(output_root_dir).mkdir(parents=True, exist_ok=True)

# 遍历所有子文件夹
for subdir, dirs, files in os.walk(root_dir):
    # 检查是否有 'spot_ST.npz' 文件
    if 'spot_ST.npz' in files:
        # 构建当前文件的完整路径
        input_file_path = os.path.join(subdir, 'spot_ST.npz')
        
        # 加载 npz 文件
        visiumhd = load_npz(input_file_path)
        visiumhd_array = visiumhd.toarray()

        # 假设我们已经有了匹配的索引 (result_indices)，以及 reshaped_visium 数据
        reshaped_visiumhd = visiumhd_array.reshape(26, 26, -1)  # 26,26,8049
        matched_visium = np.full_like(reshaped_visiumhd, -1)  # 初始化匹配矩阵，默认值是 -1

        # 遍历 visiumhd 中每个通道，并通过映射表进行匹配
        for i in range(reshaped_visiumhd.shape[2]):
            index = result_indices[i]  # 假设 result_indices[i] 是 visium 中对应通道的索引
            if index != -1:
                # 如果找到了匹配，将 visiumhd 中的通道值替换为 visium 中的对应通道值
                matched_visium[..., i] = reshaped_visium[..., index]

        # 构建输出文件路径（创建新的文件夹）
        subdir_relative_path = os.path.relpath(subdir, root_dir)
        output_subdir = os.path.join(output_root_dir, subdir_relative_path)
        Path(output_subdir).mkdir(parents=True, exist_ok=True)
        
        # 输出路径
        output_file_path = os.path.join(output_subdir, 'spot_ST_matched.npz')
        
        # 保存匹配后的数据
        np.savez_compressed(output_file_path, matched_visium)
        
        print(f"Processed and saved: {output_file_path}")


Processed and saved: /media/cbtil/T7 Shield/NMI/data/Visium_mouse_brain_matched/spot_ST/extract/20240917mousebrain/0_0/spot_ST_matched.npz
Processed and saved: /media/cbtil/T7 Shield/NMI/data/Visium_mouse_brain_matched/spot_ST/extract/20240917mousebrain/0_1/spot_ST_matched.npz
Processed and saved: /media/cbtil/T7 Shield/NMI/data/Visium_mouse_brain_matched/spot_ST/extract/20240917mousebrain/0_2/spot_ST_matched.npz
Processed and saved: /media/cbtil/T7 Shield/NMI/data/Visium_mouse_brain_matched/spot_ST/extract/20240917mousebrain/1_0/spot_ST_matched.npz
Processed and saved: /media/cbtil/T7 Shield/NMI/data/Visium_mouse_brain_matched/spot_ST/extract/20240917mousebrain/1_1/spot_ST_matched.npz
Processed and saved: /media/cbtil/T7 Shield/NMI/data/Visium_mouse_brain_matched/spot_ST/extract/20240917mousebrain/1_2/spot_ST_matched.npz
Processed and saved: /media/cbtil/T7 Shield/NMI/data/Visium_mouse_brain_matched/spot_ST/extract/20240917mousebrain/2_0/spot_ST_matched.npz
Processed and saved: /media