In [3]:
import numpy as np
from biom.table import Table

d_a = np.asarray([[2, 0], [6, 1], [1, 5]])
t_a = Table(d_a, ['O1', 'O2', 'O4'], ['S1', 'S4'])
d_b = np.asarray([[4, 5], [0, 3], [10, 10]])
t_b = Table(d_b, ['O1', 'O2', 'O3'], ['S2', 'S7'])
merged_table = t_a.merge(t_b)
print(merged_table)  

# Constructed from biom file
#OTU ID	S1	S2	S4	S7
O1	2.0	4.0	0.0	5.0
O2	6.0	0.0	1.0	3.0
O3	0.0	10.0	0.0	10.0
O4	1.0	0.0	5.0	0.0


In [4]:
import os
import glob
import biom
import shutil
from biom import load_table, Table

# 配置路径
data_root = "/home/dongbiao/all_study/data_new"
output_root = "/home/dongbiao/all_study/result_new"

def filter_low_freq_feature(current_table):
  print(f"过滤低频样本和特征，当前维度: {current_table.shape}")
  sid = current_table.ids(axis="sample")
  reads_depth = current_table.sum(axis="sample")
  current_table = current_table.filter(
      sid[reads_depth > 5000], axis='sample', inplace=False).remove_empty()

  fid = current_table.ids(axis="observation")
  prevalence = current_table.nonzero_counts(axis="observation")
  current_table = current_table.filter(
      fid[prevalence > 2], axis='observation', inplace=False).remove_empty()
  print(f"过滤低频样本和特征，处理后的维度: {current_table.shape}")
  return current_table

def test_to_aligment_train(test_table, train_tables):
  # print(test_table.shape)
  # print(train_tables.shape)
  # 训练集去空处理，测试集跟训练集进行维度对齐

  # 获取训练集的所有特征ID，重新索引测试表的行（特征）以匹配训练集，缺失的特征填充0
  train_features = train_tables.ids(axis='observation')
  test_df = test_table.to_dataframe()
  test_df = test_df.reindex(index=train_features, fill_value=0)

  # 创建新的BIOM表，确保特征顺序与训练集一致，过滤掉测试集中不在训练集中的 feature
  test_table = Table(
      data=test_df.values,
      observation_ids=test_df.index.tolist(),
      sample_ids=test_df.columns.tolist(),
      # observation_metadata=None,  # 可根据需要添加metadata处理
      # sample_metadata=test_table.metadata(axis='sample')
  )

  print(f"测试集样本数: {test_table.shape[1]}, 特征数: {test_table.shape[0]},训练集样本数: {train_tables.shape[1]}, 特征数: {train_tables.shape[0]}")

  return test_table,train_tables


for disease_id in os.listdir(data_root):
  disease_dir = os.path.join(data_root, disease_id)
  output_disease_dir = os.path.join(output_root, disease_id)

  # 跳过非目录和隐藏目录
  if not os.path.isdir(disease_dir) or disease_id.startswith('.'):
      continue

 # 只处理PD和SZ
  if disease_id not in [ "PD"]:
    continue

  # 获取所有研究数据文件
  all_bioms = glob.glob(os.path.join(disease_dir, "*.biom"))
  
  # 执行留一法拆分
  for leave_path in all_bioms:
    study_code = os.path.splitext(os.path.basename(leave_path))[0]
    output_study_dir = os.path.join(output_disease_dir, study_code)

    os.makedirs(output_study_dir, exist_ok=True)

    # 拷贝元数据文件（新增部分）
    src_metadata = os.path.join(disease_dir, "metadata.tsv")
    dst_metadata = os.path.join(output_disease_dir, "metadata.tsv")
    if os.path.exists(src_metadata):
        os.makedirs(output_disease_dir, exist_ok=True)
        shutil.copy2(src_metadata, dst_metadata)
    else:
        print(f"警告：{disease_id} 缺失元数据文件")
    
    # 构建训练集
    train_tables = []
    for train_path in all_bioms:
        if train_path == leave_path:
            continue
        try:
          train_study_code = os.path.splitext(os.path.basename(train_path))[0]
          if disease_id == "ASD" and train_study_code == "PRJEB11419":
            print(f"{study_code}研究中去除了{disease_id} {train_study_code}")
            continue
          # if disease_id == "PD" and train_study_code == "PRJNA601994":
          #   print(f"{study_code}研究中去除了{disease_id} {train_study_code}")
          #   continue
          print(f"加载数据集 {train_path}:")
          current_table = load_table(train_path)
          train_tables.append(current_table)
        except Exception as e:
            print(f"加载失败 {train_path}: {str(e)}")
            continue
    
    # 合并训练数据
    if not train_tables:
        print(f"警告：{disease_id} 只有一个样本，无法进行留一法拆分")
        continue

    merged_train = filter_low_freq_feature(train_tables[0]) 
    for tbl in train_tables[1:]:
        tbl = filter_low_freq_feature(tbl)
        merged_train = merged_train.merge(tbl)
    merged_train = merged_train.remove_empty()
    # 保存测试集副本
    test_table = load_table(leave_path)
    test_table = filter_low_freq_feature(test_table)

    test_table, merged_train = test_to_aligment_train(test_table, merged_train)

    # if os.path.exists(output_study_dir):
    #   shutil.rmtree(output_study_dir)
    #   print(f"已删除旧目录: {output_study_dir}")

    with biom.util.biom_open(f'{output_study_dir}/train_loo.biom', 'w') as f:
      merged_train.to_hdf5(f, "train")
    with biom.util.biom_open(f'{output_study_dir}/test_loo.biom', 'w') as f:
      test_table.to_hdf5(f, "test")

print("Processing completed with biom library integration.")

加载数据集 /home/dongbiao/all_study/data_new/PD/PRJEB30615.biom:
加载数据集 /home/dongbiao/all_study/data_new/PD/PRJNA742875.biom:
加载数据集 /home/dongbiao/all_study/data_new/PD/PRJNA510730.biom:
加载数据集 /home/dongbiao/all_study/data_new/PD/PRJEB27564.biom:
加载数据集 /home/dongbiao/all_study/data_new/PD/PRJNA594156.biom:
加载数据集 /home/dongbiao/all_study/data_new/PD/PRJEB14674.biom:
过滤低频样本和特征，当前维度: (2056, 256)
过滤低频样本和特征，处理后的维度: (1393, 255)
过滤低频样本和特征，当前维度: (3344, 171)
过滤低频样本和特征，处理后的维度: (1437, 171)
过滤低频样本和特征，当前维度: (14116, 201)
过滤低频样本和特征，处理后的维度: (1624, 201)
过滤低频样本和特征，当前维度: (9728, 265)
过滤低频样本和特征，处理后的维度: (5514, 253)
过滤低频样本和特征，当前维度: (2338, 299)
过滤低频样本和特征，处理后的维度: (1167, 288)
过滤低频样本和特征，当前维度: (14116, 327)
过滤低频样本和特征，处理后的维度: (1442, 327)
过滤低频样本和特征，当前维度: (14116, 825)
过滤低频样本和特征，处理后的维度: (3283, 825)
测试集样本数: 825, 特征数: 7908,训练集样本数: 1495, 特征数: 7908
加载数据集 /home/dongbiao/all_study/data_new/PD/PRJNA601994.biom:
加载数据集 /home/dongbiao/all_study/data_new/PD/PRJNA742875.biom:
加载数据集 /home/dongbiao/all_study/data_new/PD/PRJNA510730.biom

In [None]:
import os
import glob
import biom
import shutil
from biom import load_table, Table

# 配置路径
data_root = "/home/dongbiao/all_study/data_new"
output_root = "/home/dongbiao/all_study/result_new"

def filter_low_freq_feature(current_table):
  print(f"过滤低频样本和特征，当前维度: {current_table.shape}")
  sid = current_table.ids(axis="sample")
  reads_depth = current_table.sum(axis="sample")
  current_table = current_table.filter(
      sid[reads_depth > 5000], axis='sample', inplace=False).remove_empty()

  fid = current_table.ids(axis="observation")
  prevalence = current_table.nonzero_counts(axis="observation")
  current_table = current_table.filter(
      fid[prevalence > 2], axis='observation', inplace=False).remove_empty()
  print(f"过滤低频样本和特征，处理后的维度: {current_table.shape}")
  return current_table

def test_to_aligment_train(test_table, train_tables):
  # print(test_table.shape)
  # print(train_tables.shape)
  # 训练集去空处理，测试集跟训练集进行维度对齐

  # 获取训练集的所有特征ID，重新索引测试表的行（特征）以匹配训练集，缺失的特征填充0
  train_features = train_tables.ids(axis='observation')
  test_df = test_table.to_dataframe()
  test_df = test_df.reindex(index=train_features, fill_value=0)

  # 创建新的BIOM表，确保特征顺序与训练集一致，过滤掉测试集中不在训练集中的 feature
  test_table = Table(
      data=test_df.values,
      observation_ids=test_df.index.tolist(),
      sample_ids=test_df.columns.tolist(),
      # observation_metadata=None,  # 可根据需要添加metadata处理
      # sample_metadata=test_table.metadata(axis='sample')
  )

  print(f"测试集样本数: {test_table.shape[1]}, 特征数: {test_table.shape[0]},训练集样本数: {train_tables.shape[1]}, 特征数: {train_tables.shape[0]}")

  return test_table,train_tables


for disease_id in os.listdir(data_root):
  disease_dir = os.path.join(data_root, disease_id)
  output_disease_dir = os.path.join(output_root, disease_id)

  # 跳过非目录和隐藏目录
  if not os.path.isdir(disease_dir) or disease_id.startswith('.'):
      continue

 # 只处理PD和SZ
  if disease_id not in [ "PD"]:
    continue

  # 获取所有研究数据文件
  all_bioms = glob.glob(os.path.join(disease_dir, "*.biom"))
  
  # 执行留一法拆分
  for leave_path in all_bioms:
    study_code = os.path.splitext(os.path.basename(leave_path))[0]
    output_study_dir = os.path.join(output_disease_dir, study_code)

    os.makedirs(output_study_dir, exist_ok=True)

    # 拷贝元数据文件（新增部分）
    src_metadata = os.path.join(disease_dir, "metadata.tsv")
    dst_metadata = os.path.join(output_disease_dir, "metadata.tsv")
    if os.path.exists(src_metadata):
        os.makedirs(output_disease_dir, exist_ok=True)
        shutil.copy2(src_metadata, dst_metadata)
    else:
        print(f"警告：{disease_id} 缺失元数据文件")
    
    # 构建训练集
    train_tables = []
    for train_path in all_bioms:
        if train_path == leave_path:
            continue
        try:
          train_study_code = os.path.splitext(os.path.basename(train_path))[0]
          if disease_id == "ASD" and train_study_code == "PRJEB11419":
            print(f"{study_code}研究中去除了{disease_id} {train_study_code}")
            continue
          # if disease_id == "PD" and train_study_code == "PRJNA601994":
          #   print(f"{study_code}研究中去除了{disease_id} {train_study_code}")
          #   continue
          print(f"加载数据集 {train_path}:")
          current_table = load_table(train_path)
          train_tables.append(current_table)
        except Exception as e:
            print(f"加载失败 {train_path}: {str(e)}")
            continue
    
    # 合并训练数据
    if not train_tables:
        print(f"警告：{disease_id} 只有一个样本，无法进行留一法拆分")
        continue

    merged_train = filter_low_freq_feature(train_tables[0]) 
    for tbl in train_tables[1:]:
        tbl = filter_low_freq_feature(tbl)
        merged_train = merged_train.merge(tbl)
    merged_train = merged_train.remove_empty()
    # 保存测试集副本
    test_table = load_table(leave_path)
    test_table = filter_low_freq_feature(test_table)

    test_table, merged_train = test_to_aligment_train(test_table, merged_train)

    # if os.path.exists(output_study_dir):
    #   shutil.rmtree(output_study_dir)
    #   print(f"已删除旧目录: {output_study_dir}")

    with biom.util.biom_open(f'{output_study_dir}/train_loo.biom', 'w') as f:
      merged_train.to_hdf5(f, "train")
    with biom.util.biom_open(f'{output_study_dir}/test_loo.biom', 'w') as f:
      test_table.to_hdf5(f, "test")

print("Processing completed with biom library integration.")

In [23]:
import biom
import pandas as pd

def load_aligned_sums(biom_path, metadata_path, fill_missing=0):
    """
    对齐BIOM和metadata的样本，处理缺失样本
    :param fill_missing: 缺失样本填充值（建议0或np.nan）
    :return: 对齐后的总丰度序列
    """
    # 加载元数据
    metadata = pd.read_csv(metadata_path, sep='\t')
    metadata_samples = set(metadata['sample'])
    
    # 加载BIOM数据
    table = biom.load_table(biom_path)
    biom_df = table.to_dataframe()
    biom_samples = set(biom_df.index)
    
    # 找出不匹配的样本
    feature_sum = biom_df.T.sum(axis=1) 


    feature_sum = feature_sum.mean()
    print(f"总丰度序列：{feature_sum}")
    return feature_sum

# 示例用法
metadata_path = '/home/dongbiao/all_study/result_new/ASD/metadata.tsv'
method1_sums = load_aligned_sums('/home/dongbiao/all_study/result_new/ASD/PRJEB11419/test_loo.biom', metadata_path, fill_missing=0)
method2_sums = load_aligned_sums('/home/dongbiao/all_study/data/Agp_Austim/test_loo.biom', metadata_path, fill_missing=0)

# 合并结果

# 添加元数据信息（可选）
# metadata = pd.read_csv(metadata_path,sep='\t').set_index('sample')
# result_df = result_df.join(metadata, how='left')

print("\n最终结果前5行：")
print(result_df)

# # 加载两个数据集
# df1, metadata = load_biom_data('/home/dongbiao/all_study/result_new/ASD/PRJNA578223/test_loo.biom', '/home/dongbiao/all_study/result_new/ASD/metadata.tsv')
# df2, _ = load_biom_data('/home/dongbiao/all_study/result/PRJNA578223/test_loo.biom', '/home/dongbiao/all_study/result_new/ASD/metadata.tsv')



总丰度序列：27764.012269938652
总丰度序列：7316.02027027027

最终结果前5行：
             method1  method2
SRR10305328  39769.0  40968.0
SRR10305329  43962.0  13535.0
SRR10305330  46034.0      NaN
SRR10305331  37479.0    476.0
SRR10305332  34912.0  11942.0
...              ...      ...
SRR10305419  48980.0  10025.0
SRR10305420  48069.0  20487.0
SRR10305421  65539.0   5870.0
SRR10305422  88127.0  35072.0
SRR10305423  56769.0  39751.0

[96 rows x 2 columns]
