In [1]:
# Warning filters are installed before the third-party imports below, so
# FutureWarning / DeprecationWarning messages are suppressed from here on.
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os

from datetime import date
# Record the date on which the notebook was run.
print(date.today())

2025-12-08



```bash
pmuscle_36_processed/
├── [953K]  cts_M_s36_g13573.tsv.gz
├── [ 700]  outlier_M_sg22.tsv
└── [ 224]  tissues_pmuscle36.tsv
```

In [5]:
# Dataset locations. `sourcedir` is not referenced by the cells shown in this
# notebook; `datadir` holds the processed files that are read and written below.
sourcedir = '/mnt/disk7t/xwj/axolotl_rev/pmuscle_36/'
datadir = '/mnt/disk7t/xwj/axolotl_rev/pmuscle_36_processed/'
# Make the processed-data folder the working directory of the kernel.
os.chdir(datadir)

In [11]:
# Read-count threshold; in the cells shown here it is only echoed in the
# per-tissue summary line, not used for filtering.
min_reads = 10

# Count matrix (rows are used as genes, columns as samples further down) and
# the table of outlier gene-sample pairs. The first column of each file
# becomes the row index.
cts = pd.read_csv(f'{datadir}/cts_M_s36_g13573.tsv.gz', sep='\t', index_col=0)
outlier = pd.read_csv(f'{datadir}/outlier_M_sg22.tsv', sep='\t', index_col=0)

# Cell result: the shapes of both tables.
(cts.shape, outlier.shape)

((13573, 36), (22, 3))

In [None]:
use = 'pmuscle36'

# Tissue id -> tissue name; this dataset contains a single tissue.
tissue_name_mapping = pd.DataFrame({
    'TISSUE_ID': ['M'],
    'TISSUE_NAME': ['muscle',]
})

# One row per tissue, filled in by the loop below.
tissue_columns = ['TISSUE_ID','TISSUE_NAME','N_SAMPLE','N_GENE','CTS_FILE', 'OUTLIER_FILE',]
tissues = pd.DataFrame(index=tissue_name_mapping.index, columns=tissue_columns)

# Count matrix and outlier table of each tissue.
for idx in tissue_name_mapping.index:
    tissue_id = tissue_name_mapping.at[idx, 'TISSUE_ID']
    tissue_name = tissue_name_mapping.at[idx, 'TISSUE_NAME']
    n_gene, n_sample = cts.shape

    # Sanity summary: both shapes, how many outlier rows name a sample that is
    # a column of cts, and whether every outlier gene is a row of cts.
    in_cts = outlier["Sample"].isin(cts.columns)
    print(cts.shape, outlier.shape, in_cts.sum(), outlier["Gene"].isin(cts.index).all())

    # Keep only the outlier gene-sample pairs whose sample is present in cts.
    outlier_use = outlier.loc[in_cts].copy()

    # File names encode the tissue id and the table sizes.
    ctsfile = f'{datadir}/cts_{tissue_id}_s{n_sample}_g{n_gene}.tsv.gz'
    cts.to_csv(ctsfile, sep='\t')
    outlierfile =f'{datadir}/outlier_{tissue_id}_sg{outlier_use.shape[0]}.tsv'
    outlier_use.to_csv(outlierfile, sep='\t')

    print(f'{tissue_name}/{tissue_id}/min_reads={min_reads} stable_ngene={cts.shape[0]} nsample={cts.shape[1]}')

    tissues.loc[idx] = [tissue_id, tissue_name, n_sample, n_gene, ctsfile, outlierfile]

# Persist the per-tissue summary table next to the data files.
tissues.to_csv(f'{datadir}/tissues_{use}.tsv', sep='\t')

(13573, 36) (22, 3) 22 True
muscle/M/min_reads=10 stable_ngene=13573 nsample=36


## prepare folders and config file

In [None]:
# Prepare the result directories and the paths that the pipeline config files
# refer to.
workdir = '/mnt/disk7t/xwj/axolotl_rev/'

# level 1
# (`workdir` already ends with '/', so the derived paths contain '//'; the
# strings are left as they are because they are written into the config files.)
output_path = f'{workdir}/result/dataset_pmuscle_36'
# level 2
samples_path = f'{output_path}/samples'
task_config_path = f'{output_path}/task_config'
task_output_path = f'{output_path}/task_output'
metric_output_path = f'{output_path}/metric'

# Create the level-2 folders together with any missing parents. os.makedirs
# raises on failure, whereas the exit status of a shelled-out `mkdir -p` was
# only displayed and never checked.
for folder in (samples_path, task_config_path, task_output_path, metric_output_path):
    os.makedirs(folder, exist_ok=True)
# os.system(f'chmod --silent -R 777 {task_output_path}')

0

In [None]:
# Prepare the task config file of every tissue.
# The task_config table has one row per task; its columns hold the dataset
# name, the input files and the output filename of each method.
for i, row in tissues.iterrows():

    t  = row.TISSUE_ID
    ns = row.N_SAMPLE
    ng = row.N_GENE
    ctsfile = row.CTS_FILE

    prefix = f't{i:02d}_{t}_s{ns}_g{ng}' # id, number of samples, number of genes.

    cols = ['Dname','cts','samples','MyMethod','OUTRIDER','ABEILLE','OUTSINGLE']
    tasks = [0] # list of parallel tasks
    task_config = pd.DataFrame(index=tasks, columns=cols)
    task_config.index.name = 'task'

    task = 0 # the default task uses all samples, so there is only one task for this tissue
    task_config.loc[task, 'Dname' ] = t
    task_config.loc[task, 'cts' ] = ctsfile
    task_config.loc[task, 'samples' ] = f'{samples_path}/{prefix}.txt'
    # output filename of each method
    task_config.loc[task, 'MyMethod'] = f'{task_output_path}/{prefix}/{task:03d}_mymethod.txt.gz'
    task_config.loc[task, 'OUTRIDER'] = f'{task_output_path}/{prefix}/{task:03d}_outrider.txt.gz'
    task_config.loc[task, 'OUTSINGLE'] = f'{task_output_path}/{prefix}/{task:03d}_outsingle.txt.gz'
    task_config.loc[task, 'ABEILLE'] = f'{task_output_path}/{prefix}/{task:03d}_abeille.txt.gz'

    # 0. write the config and create the output folder of the parallel tasks.
    task_config.to_csv(f'{task_config_path}/{prefix}.config',sep='\t')
    # os.makedirs instead of a shelled-out `mkdir -p`: no shell interpolation
    # of the path, and a failure raises instead of being silently ignored.
    os.makedirs(f'{task_output_path}/{prefix}', exist_ok=True)

    # 1. sample ids of the parallel tasks: one row per task, holding that
    # task's sample list. The default task contains all samples.
    # The full matrix is read into `cts` because the next cell of this
    # notebook reads `cts.index`.
    cts = pd.read_csv(ctsfile, sep='\t',index_col=0)
    all_samples_df = pd.DataFrame([cts.columns.tolist()], index=task_config.index)
    all_samples_df.to_csv(f'{samples_path}/{prefix}.txt',sep='\t')
    

## Subsampling genes

In [15]:
# Genes that every subsample must contain: all gene ids starting with 'COL'.
# NOTE(review): the prefix match also selects ids such as COLQ, COLEC12,
# COLGALT1 and COL4A3BP — confirm that these are meant to be included.
genes_must_keep = [g for g in cts.index if g.startswith('COL')]
print(genes_must_keep)
# Cell result: the number of mandatory genes. (The former
# `print(...), len(...)` built a tuple and displayed `(None, 36)`.)
len(genes_must_keep)

['COL11A2', 'COL12A1', 'COL14A1', 'COL15A1', 'COL16A1', 'COL18A1', 'COL1A1', 'COL1A2', 'COL21A1', 'COL23A1', 'COL24A1', 'COL27A1', 'COL28A1', 'COL3A1', 'COL4A1', 'COL4A2', 'COL4A3', 'COL4A3BP', 'COL4A4', 'COL4A5', 'COL4A6', 'COL5A1', 'COL5A2', 'COL5A3', 'COL6A1', 'COL6A2', 'COL6A3', 'COL6A6', 'COL7A1', 'COL8A1', 'COL8A2', 'COLCA1', 'COLEC12', 'COLGALT1', 'COLGALT2', 'COLQ']


(None, 36)

In [None]:
# Gene-subsampling settings.
gene_sizes = [ 12000 ]  # total number of genes in each subsampled matrix
n_replicates = 10  # number of random replicates drawn per subset size
random_seeds = range(n_replicates)  # seeds 0..n_replicates-1, one per replicate

# For every tissue in `tissues`: draw the gene subsets and write, for each of
# them, the reduced count matrix, the sample list, the task config and the
# output folder. All subsets of a tissue are finally pickled into one dict.
for i, tissue_data in tissues.iterrows():
    t = tissue_data['TISSUE_ID']
    tn = tissue_data['TISSUE_NAME']
    ns = tissue_data['N_SAMPLE']
    ng = tissue_data['N_GENE']
    ctsfile = tissue_data['CTS_FILE']
    outlierfile = tissue_data['OUTLIER_FILE']

    # Base prefix: row id, tissue id, number of samples, number of genes.
    base_prefix = f't{i:02d}_{t}_s{ns}_g{ng}'

    cts = pd.read_csv(ctsfile, sep='\t', index_col=0)
    all_genes = cts.index.tolist()
    all_samples = cts.columns.tolist()
    print(i, t, tn, cts.shape, len(all_genes))

    # Mandatory genes: every gene id starting with 'COL' is kept in all subsets.
    genes_must_keep = [g for g in all_genes if g.startswith('COL')]
    print(genes_must_keep, len(genes_must_keep))

    # Pool from which the remaining genes are drawn. It is sorted because the
    # iteration order of a set of strings is not stable across Python
    # sessions; without a fixed order the same seed would select different
    # genes on every kernel restart.
    negative_genes = sorted(set(all_genes) - set(genes_must_keep))
    negative_pool = pd.Series(negative_genes, dtype=object)

    # One-row table of sample ids (row label 0 = task 0). Every subset keeps
    # all samples, so the table is built once per tissue.
    samples = pd.DataFrame(all_samples).transpose()
    samples.index.name = 'task'

    # All subsampling results of this tissue.
    sampled_configs = {}

    for size in gene_sizes:
        # Number of genes to draw at random = subset size - mandatory genes.
        n_negative = size - len(genes_must_keep)
        if not 0 <= n_negative <= len(negative_genes):
            raise ValueError(
                f'cannot build a {size}-gene subset for {t}: '
                f'{len(genes_must_keep)} mandatory genes and '
                f'{len(negative_genes)} other genes are available')

        # Results of all replicates of the current subset size.
        size_configs = {}

        for seed in random_seeds:
            # Random draw without replacement, reproducible through the seed.
            negative_selected = negative_pool.sample(n=n_negative, random_state=seed).tolist()

            # Mandatory genes first, followed by the randomly drawn ones.
            selected_genes = genes_must_keep + negative_selected
            assert len(selected_genes) == len(set(selected_genes))
            print(i,t, size, seed, len(selected_genes),  len(genes_must_keep), genes_must_keep[:3], n_negative, negative_selected[:3], )

            # Count matrix restricted to the selected genes (all samples kept).
            cts_selected_file = f'{ctsfile}.size{size}.seed{seed}.tsv.gz'
            cts.loc[selected_genes].to_csv(cts_selected_file, sep='\t')

            prefix = f'{base_prefix}_size{size}_seed{seed}'

            # Output folder of this subset.
            output_dir = os.path.join(task_output_path, prefix)
            os.makedirs(output_dir, exist_ok=True)

            # Sample list file.
            samples_file = os.path.join(samples_path, f'{prefix}.txt')
            samples.to_csv(samples_file, sep='\t', header=True)

            # Task config: a single task (0) that uses all samples.
            cols = ['Dname', 'cts', 'samples', 'MyMethod', 'OUTRIDER', 'ABEILLE', 'OUTSINGLE']
            task = 0
            task_config = pd.DataFrame(
                [{
                    'Dname': t,
                    'cts': cts_selected_file,
                    'samples': samples_file,
                    'MyMethod': os.path.join(output_dir, f'{task:03d}_mymethod.txt.gz'),
                    'OUTRIDER': os.path.join(output_dir, f'{task:03d}_outrider.txt.gz'),
                    'OUTSINGLE': os.path.join(output_dir, f'{task:03d}_outsingle.txt.gz'),
                    'ABEILLE': os.path.join(output_dir, f'{task:03d}_abeille.txt.gz'),
                }],
                columns=cols,
                index=pd.Index([task], name='task'),
            )

            # NOTE(review): written without the 'task' index column, whereas the
            # per-tissue config written earlier in this notebook keeps it —
            # confirm which layout the consumer of these files expects.
            config_file = os.path.join(task_config_path, f'{prefix}.config')
            task_config.to_csv(config_file, sep='\t', index=False)

            size_configs[f'seed_{seed}'] = {
                'config': task_config,
                'selected_genes': selected_genes,
                'selected_samples': all_samples,
                'output_dir': output_dir
            }

        sampled_configs[f'size_{size}'] = size_configs

    # sampled_configs layout:
    # {'size_<n>': {'seed_<k>': {'config': DataFrame, 'selected_genes': list,
    #                            'selected_samples': list, 'output_dir': str}}}
    output_file = os.path.join(task_config_path, f'{base_prefix}_sampled_configs_dict.pkl')
    with open(output_file, 'wb') as f:
        pickle.dump(sampled_configs, f)

    print(f"sampled_configs已保存到: {output_file}")