In [None]:
'''
Downsample the dataset to produce inputs for measuring runtime and memory usage.
'''

In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
# Input count matrix (CSV) and the directory that receives the downsampled files.
data_path = '/home/henu/work/data/DRA/10x_73k/counts_matrix.csv'
output_dir = "/home/henu/work/data/downsampling/"

# The pre-check is used only to choose the status message. Creation itself goes
# through exist_ok=True, so a directory that appears between the check and the
# call cannot make makedirs raise. If the path exists but is not a directory,
# makedirs raises FileExistsError here rather than the script reporting
# "already exists" and failing later when writing the CSV files.
dir_already_present = os.path.isdir(output_dir)
os.makedirs(output_dir, exist_ok=True)
if dir_already_present:
    print(f"目录 '{output_dir}' 已存在")
else:
    print(f"目录 '{output_dir}' 创建成功")

# Load the count matrix; the first CSV column becomes the row index.
data = pd.read_csv(data_path, index_col=0)
print(data.shape)  # shape noted by the original author: (73233, 720)

# Number of rows (cells) to draw for each downsampled dataset.
cell_sizes = [100, 500, 1000, 2000, 5000, 10000, 20000, 30000, 50000, 73233]

# Seed NumPy's global generator for reproducibility. The sample() call below
# is also given random_state=123 explicitly.
np.random.seed(123)

# The row count does not change inside the loop, so read it once.
total_cells = data.shape[0]

for n_cells in cell_sizes:
    if n_cells <= total_cells:
        # Draw n_cells rows without replacement.
        sampled_data = data.sample(n=n_cells, replace=False, random_state=123)

        # Replace the row/column labels with sequential 1-based names.
        n_rows, n_cols = sampled_data.shape
        sampled_data.index = [f"Cell{k}" for k in range(1, n_rows + 1)]
        sampled_data.columns = [f"Gene{k}" for k in range(1, n_cols + 1)]

        # Write the subset to its own CSV, keeping the new row labels.
        output_path = os.path.join(output_dir, f"cell_{n_cells}.csv")
        sampled_data.to_csv(output_path, index=True)
    else:
        # Requested more rows than the matrix holds: report and move on.
        print(f"跳过 {n_cells} 个细胞，因为原数据只有 {data.shape[0]} 个细胞")