**True Outliers**
The 49 true outliers are defined as the aberrant expression (AE) defects reported in Tables S2 and S4.
Supplemental Tables of paper [3]   
(Table S2. Summary of cases diagnosed via RNA-seq,   
Table S4. Summary of WES-diagnosed cases with an RNA-defect)  
were manually merged. All aberrant expression events (labeled as ‘AE’ in the original publication) were extracted and used as the 49 true outliers of the pfib dataset.

note:  
GeneCards Symbol: MICOS13  == C19orf70  
GeneCards Symbol: IARS1 == IARS  

In [11]:
import os
import seaborn as sns
import numpy as np
import pandas as pd
import random
from sklearn.metrics import auc
import matplotlib.pyplot as plt
from datetime import date
import networkx as nx
from tqdm import tqdm

```bash

/mnt/disk7t/xwj/axolotl_rev/pfib_423/fib_ns--hg19--gencode34
├── [ 370]  DESCRIPTION.txt
├── [6.3M]  geneCounts.tsv.gz
├── [ 34M]  k_j_counts.tsv.gz
├── [ 13M]  k_theta_counts.tsv.gz
├── [ 32M]  n_psi3_counts.tsv.gz
├── [ 32M]  n_psi5_counts.tsv.gz
├── [ 46M]  n_theta_counts.tsv.gz
└── [7.7K]  sample_annotation.tsv
/mnt/disk7t/xwj/axolotl_rev/pfib_423/fib_ss--hg19--gencode34
├── [ 482]  DESCRIPTION.txt
├── [ 10M]  geneCounts.tsv.gz
├── [ 61M]  k_j_counts.tsv.gz
├── [ 25M]  k_theta_counts.tsv.gz
├── [ 57M]  n_psi3_counts.tsv.gz
├── [ 57M]  n_psi5_counts.tsv.gz
├── [ 86M]  n_theta_counts.tsv.gz
└── [ 13K]  sample_annotation.tsv


'/mnt/disk7t/xwj/axolotl_rev/pfib_423_processed/'
df_true49.txt (outliers)
/mnt/disk7t/xwj/axolotl_rev/pfib_423/gencode.v44lift37.basic.annotation.gtf
```

## read known outliers and process counts 

In [3]:
gtf_file = "/mnt/disk7t/xwj/axolotl_rev/pfib_423/gencode.v44lift37.basic.annotation.gtf"

# Build an Ensembl gene ID -> gene symbol lookup table from the GTF annotation.
# Only "gene" records are used; their 9th column holds `key "value";` pairs.
annotations = []
with open(gtf_file, "r") as gtf_handle:
    for line in gtf_handle:
        if line.startswith("#"):  # header / comment lines
            continue
        fields = line.strip().split("\t")
        # Skip blank or truncated records instead of failing with IndexError.
        if len(fields) < 9 or fields[2] != "gene":
            continue
        # Parse attributes by key rather than by token position, so the result
        # does not depend on attribute order or on the spacing around ';'.
        attrs = {}
        for attr in fields[8].split(";"):
            key, _, value = attr.strip().partition(" ")
            if key in ("gene_id", "gene_name"):
                attrs[key] = value.strip().strip('"')
        gene_id = attrs.get("gene_id")
        gene_name = attrs.get("gene_name")
        if gene_id and gene_name:
            annotations.append((gene_id, gene_name))

# One row per GTF gene record.
df = pd.DataFrame(annotations, columns=["Ensembl Gene ID", "Gene Symbol"])
# Unversioned Ensembl ID: everything before the first '.'.
df['ensg'] = df['Ensembl Gene ID'].str.split('.').str[0]
# Collapse repeated (symbol, ID) pairs and index the lookup table by the unversioned ID
# while keeping 'ensg' available as a regular column.
gtf = df.drop_duplicates(subset=['Gene Symbol', 'ensg']).set_index('ensg', drop=False)

print(gtf.head())

                      Ensembl Gene ID  Gene Symbol             ensg
ensg                                                               
ENSG00000223972   ENSG00000223972.6_6      DDX11L1  ENSG00000223972
ENSG00000227232   ENSG00000227232.5_5       WASH7P  ENSG00000227232
ENSG00000243485  ENSG00000243485.5_11  MIR1302-2HG  ENSG00000243485
ENSG00000237613   ENSG00000237613.2_6      FAM138A  ENSG00000237613
ENSG00000268020   ENSG00000268020.3_5       OR4G4P  ENSG00000268020


In [5]:
workdir = '/mnt/disk7t/xwj/axolotl_rev/'
datadir = '/mnt/disk7t/xwj/axolotl_rev/pfib_423_processed/'

# Literature-derived true-outlier labels (one sample/gene pair per row),
# stored as a tab-separated table.
file = f'{datadir}/df_true49.txt'
outlier = pd.read_table(file)

# Disabled by default: export of the label table as a supplemental spreadsheet.
if False:
    outlier.to_excel(
        f'{workdir}/result/table/s3_pfib_outlier49.xlsx',
        sheet_name='s3_pfib_outlier49',
        index=True,
    )
outlier.head()

Unnamed: 0,Sample,Sex,Gene,RNAdefects,TableName
0,R80184,M,ALDH18A1,"AE, MAE",S2
1,R60537,M,ATP6AP1,"AE, Var",S2
2,R62943,F,MICOS13,"AE, AS",S2
3,R96820,F,CLPP,"AE, AS",S2
4,R77611,F,DLD,"AE, MAE",S2


In [None]:
# Split the cohort by sequencing protocol: for each protocol build a filtered,
# gene-symbol-indexed count matrix and the matching subset of true outliers,
# write both to disk, and record them in the `tissues` summary table.
use = 'split'

tissue_name_mapping = pd.DataFrame({
    'TISSUE_ID': ['FBSS','FBNS'],
    'TISSUE_NAME': ['Fibroblast_Stranded','Fibroblast_NonStranded'],
    'file' : ['/mnt/disk7t/xwj/axolotl_rev/pfib_423/fib_ss--hg19--gencode34/geneCounts.tsv.gz', 
             '/mnt/disk7t/xwj/axolotl_rev/pfib_423/fib_ns--hg19--gencode34/geneCounts.tsv.gz'],
})
tissues = pd.DataFrame(
    index = tissue_name_mapping.index,
    columns = ['TISSUE_ID','TISSUE_NAME','N_SAMPLE','N_GENE','CTS_FILE', 'OUTLIER_FILE',])

# A gene is kept only if it has more than `min_reads` reads in every sample.
min_reads = 10

for rec in tissue_name_mapping.itertuples():
    idx, tissue_id, tissue_name, file = rec.Index, rec.TISSUE_ID, rec.TISSUE_NAME, rec.file

    # (1) cts: expression matrix (genes x samples)
    cts_raw = pd.read_csv(file, sep="\t", index_col=0)
    expressed = cts_raw.loc[cts_raw.any(axis=1)]  # drop rows that are zero everywhere
    expressed.index = expressed.index.str.split('.').str[0]  # strip the ID version suffix

    # Keep genes present in the annotation and relabel them with gene symbols.
    annotated = expressed[expressed.index.isin(gtf["ensg"])]
    annotated.index = gtf.loc[annotated.index, 'Gene Symbol']

    # Rows sharing a symbol are summed, then the expression filter is applied.
    per_symbol = annotated.groupby(annotated.index).sum()
    cts = per_symbol[per_symbol.gt(min_reads).all(axis=1)]
    cts = cts.rename_axis(index='Gene', columns='Sample')
    n_gene, n_sample = cts.shape

    # (2) outlier: true outlier gene-sample pairs whose sample is in this matrix
    sample_in_cts = outlier["Sample"].isin(cts.columns)
    print( cts.shape, outlier.shape, sample_in_cts.sum(), outlier["Gene"].isin(cts.index).all(),)
    outlier_use = outlier[sample_in_cts].copy()

    ctsfile = f'{datadir}/cts_{tissue_id}_s{n_sample}_g{n_gene}.tsv.gz'
    cts.to_csv(ctsfile, sep='\t')
    outlierfile =f'{datadir}/outlier_{tissue_id}_sg{outlier_use.shape[0]}.tsv'
    outlier_use.to_csv(outlierfile, sep='\t')

    print(f'{tissue_name}/{tissue_id}/min_reads={min_reads} stable_ngene={n_gene} nsample={n_sample}')

    tissues.loc[idx, :] = [ tissue_id, tissue_name, n_sample, n_gene, ctsfile, outlierfile]

tissues.to_csv(f'{datadir}/tissues_{use}.tsv', sep='\t')

(12369, 269) (49, 5) 25 True
Fibroblast_Stranded/FBSS/min_reads=10 stable_ngene=12369 nsample=269
(13411, 154) (49, 5) 24 True
Fibroblast_NonStranded/FBNS/min_reads=10 stable_ngene=13411 nsample=154


In [7]:
# Sanity check: dimensions of the outlier label table and of the per-tissue summary table.
outlier.shape, tissues.shape

((49, 5), (2, 6))

In [8]:
# Display the per-tissue summary (sample/gene counts and the written file paths).
tissues

Unnamed: 0,TISSUE_ID,TISSUE_NAME,N_SAMPLE,N_GENE,CTS_FILE,OUTLIER_FILE
0,FBSS,Fibroblast_Stranded,269,12369,/mnt/disk7t/xwj/axolotl_rev/pfib_423_processed...,/mnt/disk7t/xwj/axolotl_rev/pfib_423_processed...
1,FBNS,Fibroblast_NonStranded,154,13411,/mnt/disk7t/xwj/axolotl_rev/pfib_423_processed...,/mnt/disk7t/xwj/axolotl_rev/pfib_423_processed...


## prepare folders and config file

In [9]:
# Prepare the result directory tree and the paths used by the config files.
workdir = '/mnt/disk7t/xwj/axolotl_rev/'

# level 1: dataset root
output_path = f'{workdir}/result/dataset_pfib_423_split'
# level 2: one folder per artefact type
samples_path = f'{output_path}/samples'
task_config_path = f'{output_path}/task_config'
task_output_path = f'{output_path}/task_output'
metric_output_path = f'{output_path}/metric'

# Create the folders in-process: unlike a shell `mkdir -p` this is safe for paths
# containing spaces or shell metacharacters and raises on failure instead of
# returning an exit code that is easy to overlook.
for folder in (samples_path, task_config_path, task_output_path, metric_output_path):
    os.makedirs(folder, exist_ok=True)

0

In [None]:
# Prepare the task config files for the full-cohort run of every tissue.
# Each config is a one-row table (task 0 = all samples) whose columns give the
# input files and the output filename expected from each method.
cols = ['Dname','cts','samples','MyMethod','OUTRIDER','ABEILLE','OUTSINGLE']
methods = ['MyMethod','OUTRIDER','ABEILLE','OUTSINGLE']

for i, row in tissues.iterrows():

    t  = row.TISSUE_ID
    ns = row.N_SAMPLE
    ng = row.N_GENE
    ctsfile = row.CTS_FILE
    outlierfile = row.OUTLIER_FILE

    prefix = f't{i:02d}_{t}_s{ns}_g{ng}' # id, number of samples, number of genes.
    task_dir = f'{task_output_path}/{prefix}'
    samples_file = f'{samples_path}/{prefix}.txt'

    task = 0  # the only task: all samples of this tissue
    task_config = pd.DataFrame(index=[task], columns=cols)
    task_config.index.name = 'task'
    task_config.loc[task, 'Dname'] = t
    task_config.loc[task, 'cts'] = ctsfile
    task_config.loc[task, 'samples'] = samples_file
    for method in methods:
        # e.g. <task_dir>/000_outrider.txt.gz
        task_config.loc[task, method] = f'{task_dir}/{task:03d}_{method.lower()}.txt.gz'

    # 0. config file and output folder of this tissue
    task_config.to_csv(f'{task_config_path}/{prefix}.config',sep='\t')
    os.makedirs(task_dir, exist_ok=True)

    # 1. sample ids: one row per task, one sample id per column.
    #    Only the header of the count matrix is read; the counts are not needed here.
    sample_ids = pd.read_csv(ctsfile, sep='\t', index_col=0, nrows=0).columns.tolist()
    all_samples_df = pd.DataFrame([sample_ids], index=task_config.index)
    all_samples_df.to_csv(samples_file, sep='\t')

    # 2. true outliers of this tissue (no down-sampling).
    #    A dedicated name is used so the notebook-level `outlier` table is not
    #    overwritten; the cell that builds `tissues` reads it and would otherwise
    #    see a per-tissue subset when re-run.
    tissue_outliers = pd.read_csv(outlierfile, sep='\t', index_col=0)
    tissue_outliers.to_csv(f'{samples_path}/{prefix}_outliers.txt',sep='\t')

## subset to small sizes

In [None]:
import pickle

# Down-sample each tissue (stranded / non-stranded) to fixed cohort sizes.
# For every (size, seed) pair this writes a sample list, a one-row task config and an
# output folder, and finally pickles all drawn cohorts of the tissue.
sample_sizes = [100, 60, 30, 10]
n_replicates = 10  # replicates per cohort size
random_seeds = range(n_replicates)  # seeds 0..n_replicates-1

cols = ['Dname', 'cts', 'samples', 'MyMethod', 'OUTRIDER', 'ABEILLE', 'OUTSINGLE']
methods = ['MyMethod', 'OUTRIDER', 'ABEILLE', 'OUTSINGLE']

for i, tissue_data in tissues.iterrows():
    t = tissue_data['TISSUE_ID']
    tn = tissue_data['TISSUE_NAME']
    ns = tissue_data['N_SAMPLE']
    ng = tissue_data['N_GENE']
    ctsfile = tissue_data['CTS_FILE']
    outlierfile = tissue_data['OUTLIER_FILE']

    base_prefix = f't{i:02d}_{t}_s{ns}_g{ng}' # id, number of samples, number of genes.

    cts = pd.read_csv(ctsfile, sep='\t', index_col=0)
    all_samples = cts.columns.tolist()
    print(i, t, tn, cts.shape, len(all_samples))

    # Positive samples: samples carrying a true outlier. A dedicated name is used so
    # the notebook-level `outlier` table is not overwritten by a per-tissue subset.
    tissue_outliers = pd.read_csv(outlierfile, sep='\t', index_col=0)
    positive_samples = tissue_outliers['Sample']
    assert positive_samples.is_unique, 'each positive sample must appear exactly once'

    # Negative samples, kept in count-matrix column order. Going through a set here
    # would make the order (and therefore the seeded draws below) change between
    # kernel sessions, because string hashing is randomized per process.
    positive_set = set(positive_samples)
    negative_pool = pd.Series([s for s in all_samples if s not in positive_set])

    # {size_<n>: {seed_<k>: {'config': df, 'selected_samples': list, 'output_dir': str}}}
    sampled_configs = {}

    for size in sample_sizes:
        size_configs = {}

        for seed in random_seeds:
            if size <= len(positive_samples):
                # NOTE(review): for sizes up to the number of positives the cohort is
                # drawn from positive samples only (no negatives) — confirm intended.
                part1_sampled = positive_samples.sample(n=size, random_state=seed).tolist()
                n_negative = 0
                negative_selected = []
            else:
                # All positives, topped up with randomly drawn negatives.
                part1_sampled = positive_samples.tolist()
                n_negative = size - len(part1_sampled)
                negative_selected = negative_pool.sample(n=n_negative, random_state=seed).tolist()

            selected_samples = part1_sampled + negative_selected
            assert len(selected_samples) == len(set(selected_samples))

            prefix = f'{base_prefix}_size{size}_seed{seed}'

            # Output folder of this cohort
            output_dir = os.path.join(task_output_path, prefix)
            os.makedirs(output_dir, exist_ok=True)

            # Sample list: a single row (task 0) with one sample id per column
            samples_file = os.path.join(samples_path, f'{prefix}.txt')
            samples = pd.DataFrame([selected_samples])
            samples.index.name = 'task'
            samples.to_csv(samples_file, sep='\t', header=True)

            # One-row task config with the input files and per-method output files
            task = 0
            config_row = {'Dname': t, 'cts': ctsfile, 'samples': samples_file}
            for method in methods:
                config_row[method] = os.path.join(output_dir, f'{task:03d}_{method.lower()}.txt.gz')
            task_config = pd.DataFrame([config_row], columns=cols,
                                       index=pd.Index([task], name='task'))

            # NOTE(review): written without the 'task' index column, whereas the
            # full-cohort config is written with it — confirm the reader handles both.
            config_file = os.path.join(task_config_path, f'{prefix}.config')
            task_config.to_csv(config_file, sep='\t', index=False)

            size_configs[f'seed_{seed}'] = {
                'config': task_config,
                'selected_samples': selected_samples,
                'output_dir': output_dir
            }

        sampled_configs[f'size_{size}'] = size_configs

    # Persist every drawn cohort of this tissue
    output_file = os.path.join(task_config_path, f'{base_prefix}_sampled_configs_dict.pkl')
    with open(output_file, 'wb') as f:
        pickle.dump(sampled_configs, f)

    print(f"sampled_configs已保存到: {output_file}")


0 FBSS Fibroblast_Stranded (12369, 269) 269
sampled_configs已保存到: /mnt/disk7t/xwj/axolotl_rev//result/dataset_pfib_423_split/task_config/t00_FBSS_s269_g12369_sampled_configs_dict.pkl
1 FBNS Fibroblast_NonStranded (13411, 154) 154
sampled_configs已保存到: /mnt/disk7t/xwj/axolotl_rev//result/dataset_pfib_423_split/task_config/t01_FBNS_s154_g13411_sampled_configs_dict.pkl


In [None]:
import pickle

# Build cohorts of fixed size with a controlled fraction of positive samples.
# NOTE(review): this cell is a near copy of the other outlier-fraction cell and differs
# only in `list_outlier_pct` / `sample_sizes`; consider factoring both into one function.
list_outlier_pct = [ 0.04, 0.08, 0.16, 0.24,]
sample_sizes = [100,]

n_replicates = 10  # replicates per setting
random_seeds = range(n_replicates)  # seeds 0..n_replicates-1

cols = ['Dname', 'cts', 'samples', 'MyMethod', 'OUTRIDER', 'ABEILLE', 'OUTSINGLE']
methods = ['MyMethod', 'OUTRIDER', 'ABEILLE', 'OUTSINGLE']

for i, tissue_data in tissues.iterrows():
    t = tissue_data['TISSUE_ID']
    tn = tissue_data['TISSUE_NAME']
    ns = tissue_data['N_SAMPLE']
    ng = tissue_data['N_GENE']
    ctsfile = tissue_data['CTS_FILE']
    outlierfile = tissue_data['OUTLIER_FILE']

    cts = pd.read_csv(ctsfile, sep='\t', index_col=0)
    all_samples = cts.columns.tolist()
    print(i, t, tn, cts.shape, len(all_samples))

    # Positive samples: samples carrying a true outlier. A dedicated name is used so
    # the notebook-level `outlier` table is not overwritten by a per-tissue subset.
    tissue_outliers = pd.read_csv(outlierfile, sep='\t', index_col=0)
    positive_samples = tissue_outliers['Sample']
    assert positive_samples.is_unique, 'each positive sample must appear exactly once'

    # Negative samples, kept in count-matrix column order. Going through a set here
    # would make the order (and therefore the seeded draws below) change between
    # kernel sessions, because string hashing is randomized per process.
    positive_set = set(positive_samples)
    negative_pool = pd.Series([s for s in all_samples if s not in positive_set])

    for pct in list_outlier_pct:
        base_prefix = f't{i:02d}_{t}_s{ns}_g{ng}_pct{pct:.2f}'
        print(base_prefix)
        # {size_<n>: {seed_<k>: {'config': df, 'selected_samples': list, 'output_dir': str}}}
        sampled_configs = {}

        for size in sample_sizes:
            size_configs = {}
            # round() guards against float error (e.g. 100 * 0.29 == 28.999...).
            n_positive = int(round(size * pct))
            if n_positive > len(positive_samples):
                raise ValueError(
                    f'{base_prefix}: need {n_positive} positive samples for size {size}, '
                    f'but only {len(positive_samples)} are available')

            for seed in random_seeds:
                part1_sampled = positive_samples.sample(
                    n=n_positive, random_state=seed*1000+n_positive).tolist()
                n_negative = size - len(part1_sampled)
                # Fill the rest of the cohort with randomly drawn negatives.
                negative_selected = negative_pool.sample(
                    n=n_negative, random_state=seed*2000+n_negative).tolist()

                selected_samples = part1_sampled + negative_selected
                assert len(selected_samples) == len(set(selected_samples))

                # prefix = tissue + pct + size + seed
                prefix = f'{base_prefix}_size{size}_seed{seed}'

                # Output folder of this cohort
                output_dir = os.path.join(task_output_path, prefix)
                os.makedirs(output_dir, exist_ok=True)

                # Sample list: a single row (task 0) with one sample id per column
                samples_file = os.path.join(samples_path, f'{prefix}.txt')
                samples = pd.DataFrame([selected_samples])
                samples.index.name = 'task'
                samples.to_csv(samples_file, sep='\t', header=True)

                # One-row task config with the input files and per-method output files
                task = 0
                config_row = {'Dname': t, 'cts': ctsfile, 'samples': samples_file}
                for method in methods:
                    config_row[method] = os.path.join(output_dir, f'{task:03d}_{method.lower()}.txt.gz')
                task_config = pd.DataFrame([config_row], columns=cols,
                                           index=pd.Index([task], name='task'))

                # NOTE(review): written without the 'task' index column, whereas the
                # full-cohort config is written with it — confirm the reader handles both.
                config_file = os.path.join(task_config_path, f'{prefix}.config')
                task_config.to_csv(config_file, sep='\t', index=False)

                size_configs[f'seed_{seed}'] = {
                    'config': task_config,
                    'selected_samples': selected_samples,
                    'output_dir': output_dir
                }

            sampled_configs[f'size_{size}'] = size_configs

        # The pickle name does not contain the cohort size, so another run with the same
        # tissue/pct but a different size targets the same file. Merge into what is
        # already there instead of overwriting it; entries for the sizes generated
        # here replace older ones.
        output_file = os.path.join(task_config_path, f'{base_prefix}_sampled_configs_dict.pkl')
        merged_configs = {}
        if os.path.exists(output_file):
            # Trusted input: this file is only ever produced by this notebook.
            with open(output_file, 'rb') as f:
                merged_configs = pickle.load(f)
        merged_configs.update(sampled_configs)
        with open(output_file, 'wb') as f:
            pickle.dump(merged_configs, f)

        print(f"sampled_configs已保存到: {output_file}")

0 FBSS Fibroblast_Stranded (12369, 269) 269
t00_FBSS_s269_g12369_pct0.04
sampled_configs已保存到: /mnt/disk7t/xwj/axolotl_rev//result/dataset_pfib_423_split/task_config/t00_FBSS_s269_g12369_pct0.04_sampled_configs_dict.pkl
t00_FBSS_s269_g12369_pct0.08
sampled_configs已保存到: /mnt/disk7t/xwj/axolotl_rev//result/dataset_pfib_423_split/task_config/t00_FBSS_s269_g12369_pct0.08_sampled_configs_dict.pkl
t00_FBSS_s269_g12369_pct0.16
sampled_configs已保存到: /mnt/disk7t/xwj/axolotl_rev//result/dataset_pfib_423_split/task_config/t00_FBSS_s269_g12369_pct0.16_sampled_configs_dict.pkl
t00_FBSS_s269_g12369_pct0.24
sampled_configs已保存到: /mnt/disk7t/xwj/axolotl_rev//result/dataset_pfib_423_split/task_config/t00_FBSS_s269_g12369_pct0.24_sampled_configs_dict.pkl
1 FBNS Fibroblast_NonStranded (13411, 154) 154
t01_FBNS_s154_g13411_pct0.04
sampled_configs已保存到: /mnt/disk7t/xwj/axolotl_rev//result/dataset_pfib_423_split/task_config/t01_FBNS_s154_g13411_pct0.04_sampled_configs_dict.pkl
t01_FBNS_s154_g13411_pct0.08
sampl

In [None]:
import pickle

# Build cohorts of fixed size with a controlled fraction of positive samples.
# NOTE(review): this cell is a near copy of the other outlier-fraction cell and differs
# only in `list_outlier_pct` / `sample_sizes`; consider factoring both into one function.
list_outlier_pct = [ 0.04, 0.08, 0.16, 0.24, 0.32, 0.40 ]
sample_sizes = [50,]

n_replicates = 10  # replicates per setting
random_seeds = range(n_replicates)  # seeds 0..n_replicates-1

cols = ['Dname', 'cts', 'samples', 'MyMethod', 'OUTRIDER', 'ABEILLE', 'OUTSINGLE']
methods = ['MyMethod', 'OUTRIDER', 'ABEILLE', 'OUTSINGLE']

for i, tissue_data in tissues.iterrows():
    t = tissue_data['TISSUE_ID']
    tn = tissue_data['TISSUE_NAME']
    ns = tissue_data['N_SAMPLE']
    ng = tissue_data['N_GENE']
    ctsfile = tissue_data['CTS_FILE']
    outlierfile = tissue_data['OUTLIER_FILE']

    cts = pd.read_csv(ctsfile, sep='\t', index_col=0)
    all_samples = cts.columns.tolist()
    print(i, t, tn, cts.shape, len(all_samples))

    # Positive samples: samples carrying a true outlier. A dedicated name is used so
    # the notebook-level `outlier` table is not overwritten by a per-tissue subset.
    tissue_outliers = pd.read_csv(outlierfile, sep='\t', index_col=0)
    positive_samples = tissue_outliers['Sample']
    assert positive_samples.is_unique, 'each positive sample must appear exactly once'

    # Negative samples, kept in count-matrix column order. Going through a set here
    # would make the order (and therefore the seeded draws below) change between
    # kernel sessions, because string hashing is randomized per process.
    positive_set = set(positive_samples)
    negative_pool = pd.Series([s for s in all_samples if s not in positive_set])

    for pct in list_outlier_pct:
        base_prefix = f't{i:02d}_{t}_s{ns}_g{ng}_pct{pct:.2f}'
        print(base_prefix)
        # {size_<n>: {seed_<k>: {'config': df, 'selected_samples': list, 'output_dir': str}}}
        sampled_configs = {}

        for size in sample_sizes:
            size_configs = {}
            # round() guards against float error (e.g. 100 * 0.29 == 28.999...).
            n_positive = int(round(size * pct))
            if n_positive > len(positive_samples):
                raise ValueError(
                    f'{base_prefix}: need {n_positive} positive samples for size {size}, '
                    f'but only {len(positive_samples)} are available')

            for seed in random_seeds:
                part1_sampled = positive_samples.sample(
                    n=n_positive, random_state=seed*1000+n_positive).tolist()
                n_negative = size - len(part1_sampled)
                # Fill the rest of the cohort with randomly drawn negatives.
                negative_selected = negative_pool.sample(
                    n=n_negative, random_state=seed*2000+n_negative).tolist()

                selected_samples = part1_sampled + negative_selected
                assert len(selected_samples) == len(set(selected_samples))

                # prefix = tissue + pct + size + seed
                prefix = f'{base_prefix}_size{size}_seed{seed}'

                # Output folder of this cohort
                output_dir = os.path.join(task_output_path, prefix)
                os.makedirs(output_dir, exist_ok=True)

                # Sample list: a single row (task 0) with one sample id per column
                samples_file = os.path.join(samples_path, f'{prefix}.txt')
                samples = pd.DataFrame([selected_samples])
                samples.index.name = 'task'
                samples.to_csv(samples_file, sep='\t', header=True)

                # One-row task config with the input files and per-method output files
                task = 0
                config_row = {'Dname': t, 'cts': ctsfile, 'samples': samples_file}
                for method in methods:
                    config_row[method] = os.path.join(output_dir, f'{task:03d}_{method.lower()}.txt.gz')
                task_config = pd.DataFrame([config_row], columns=cols,
                                           index=pd.Index([task], name='task'))

                # NOTE(review): written without the 'task' index column, whereas the
                # full-cohort config is written with it — confirm the reader handles both.
                config_file = os.path.join(task_config_path, f'{prefix}.config')
                task_config.to_csv(config_file, sep='\t', index=False)

                size_configs[f'seed_{seed}'] = {
                    'config': task_config,
                    'selected_samples': selected_samples,
                    'output_dir': output_dir
                }

            sampled_configs[f'size_{size}'] = size_configs

        # The pickle name does not contain the cohort size, so another run with the same
        # tissue/pct but a different size targets the same file. Merge into what is
        # already there instead of overwriting it; entries for the sizes generated
        # here replace older ones.
        output_file = os.path.join(task_config_path, f'{base_prefix}_sampled_configs_dict.pkl')
        merged_configs = {}
        if os.path.exists(output_file):
            # Trusted input: this file is only ever produced by this notebook.
            with open(output_file, 'rb') as f:
                merged_configs = pickle.load(f)
        merged_configs.update(sampled_configs)
        with open(output_file, 'wb') as f:
            pickle.dump(merged_configs, f)

        print(f"sampled_configs已保存到: {output_file}")

In [121]:
# Spot-check one generated task config. The size key is taken from whatever the most
# recent sampling cell produced, so this does not break when the cohort sizes change.
size_key = next(iter(sampled_configs))
sampled_configs[size_key]['seed_1']['config'].values

array([['FBNS',
        '/mnt/disk7t/xwj/axolotl_rev/pfib_423_processed//cts_FBNS_s154_g13411.tsv.gz',
        '/mnt/disk7t/xwj/axolotl_rev//result/dataset_pfib_423_split/samples/t01_FBNS_s154_g13411_pct0.24_size100_seed1.txt',
        '/mnt/disk7t/xwj/axolotl_rev//result/dataset_pfib_423_split/task_output/t01_FBNS_s154_g13411_pct0.24_size100_seed1/000_mymethod.txt.gz',
        '/mnt/disk7t/xwj/axolotl_rev//result/dataset_pfib_423_split/task_output/t01_FBNS_s154_g13411_pct0.24_size100_seed1/000_outrider.txt.gz',
        '/mnt/disk7t/xwj/axolotl_rev//result/dataset_pfib_423_split/task_output/t01_FBNS_s154_g13411_pct0.24_size100_seed1/000_abeille.txt.gz',
        '/mnt/disk7t/xwj/axolotl_rev//result/dataset_pfib_423_split/task_output/t01_FBNS_s154_g13411_pct0.24_size100_seed1/000_outsingle.txt.gz']],
      dtype=object)