In [1]:
# Set IPython's Completer.use_jedi option to False for this session, then
# report the installed IPython and Jedi versions for reference.
%config Completer.use_jedi = False
import sys, IPython
print("IPython version:", IPython.__version__)
# Jedi is imported here only to read its version string.
import jedi
print("Jedi version:", jedi.__version__)

IPython version: 9.8.0
Jedi version: 0.19.2


In [2]:
import scanpy as sc
import pandas as pd
import anndata as ad
import numpy as np
import scipy
import matplotlib
import seaborn as sns

In [3]:
# Absolute path of the input .h5ad file read by the loading cell below.
# NOTE(review): hard-coded cluster path — the notebook only runs where this
# filesystem is mounted; consider deriving it from a configurable data directory.
DATA_PATH = "/gpfs/hybrid/data/downloads/gcloud/arc-scbasecount/2025-02-25/h5ad/GeneFull_Ex50pAS/Homo_sapiens/SRX24585612.h5ad"


In [4]:
# Load the AnnData object stored in the .h5ad file at DATA_PATH.
adata = sc.read_h5ad(DATA_PATH)

# Bare expression: display the object's summary as the cell output.
adata

AnnData object with n_obs × n_vars = 2322 × 36601
    obs: 'gene_count', 'umi_count', 'SRX_accession'
    var: 'gene_symbols', 'feature_types'

In [5]:
# Quick look at the value range of the expression matrix: (minimum, maximum).
adata.X.min(), adata.X.max()

(np.float32(0.0), np.float32(7144.0))

In [6]:
import numpy as np
import scipy.sparse as sp

In [14]:
# Positional index of the single row (cell) to inspect; 2250 lies within the
# 2322 observations shown in the summary printed when the object was loaded.
cell_index = 2250
# Row of the expression matrix for that cell. Depending on how adata.X is
# stored this is either a sparse row or a dense array; the next cell handles both.
cell_expression = adata.X[cell_index]

In [15]:
# Work on a dense array: a sparsely stored row is converted with toarray(),
# anything else is kept exactly as it is.
cell_expression = (
    cell_expression.toarray()
    if scipy.sparse.issparse(cell_expression)
    else cell_expression
)

# Number of entries in the selected row that are exactly zero.
zero_count = (cell_expression == 0).sum()

zero_count

np.int64(35234)

In [16]:
import numpy as np
import scipy.sparse


# Per-cell summary of how many genes are detected (non-zero) versus zero, plus
# the overall share of zero entries in the expression matrix.

# 1. Total number of genes (columns of the expression matrix).
total_genes = adata.n_vars

# 2. Number of non-zero genes in each cell (one entry per row).
if scipy.sparse.issparse(adata.X):
    # getnnz() reports *stored* entries, which includes explicitly stored
    # zeros. Comparing against 0 first gives a boolean sparse matrix that only
    # keeps the truly non-zero positions, so this branch counts the same thing
    # as the dense branch below.
    non_zero_counts = (adata.X != 0).getnnz(axis=1)
else:
    # Dense matrix: count non-zero entries along each row with numpy.
    non_zero_counts = np.count_nonzero(adata.X, axis=1)

# 3. Average number of non-zero genes per cell.
avg_non_zero = np.mean(non_zero_counts)

# 4. Average number of zero-valued genes per cell, by subtraction.
avg_zero = total_genes - avg_non_zero

print(f"总基因数: {total_genes}")
print(f"平均每个细胞检测到的基因数 (非0): {avg_non_zero:.2f}")
print(f"平均每个细胞的 0 值基因数: {avg_zero:.2f}")

# Overall sparsity: percentage of matrix positions that are zero.
sparsity = avg_zero / total_genes * 100
print(f"数据集整体稀疏度: {sparsity:.2f}% (即平均有 {sparsity:.2f}% 的位置是 0)")

总基因数: 36601
平均每个细胞检测到的基因数 (非0): 1313.84
平均每个细胞的 0 值基因数: 35287.16
数据集整体稀疏度: 96.41% (即平均有 96.41% 的位置是 0)


In [None]:
import pandas as pd
import numpy as np
import scipy.sparse
from pathlib import Path

# Sparsity of the expression matrix after aligning its genes to a fixed target
# gene list: target genes missing from adata count as all-zero columns.

# Location of the target gene list. The path is relative, so it is resolved
# against the kernel's current working directory; fail early and report the
# fully resolved location so a wrong working directory is easy to diagnose.
GENE_ORDER_PATH = Path('../../data/assets/gene_order.tsv')
if not GENE_ORDER_PATH.is_file():
    raise FileNotFoundError(
        f"Gene order file not found: {GENE_ORDER_PATH.resolve()} "
        f"(relative path '{GENE_ORDER_PATH}' resolved from {Path.cwd()})"
    )

# 1. Read the target gene list (gene order).
# header=None: the first line of the file is assumed to already be a gene name
# rather than a column header — TODO confirm against the file.
# Duplicates are dropped (first occurrence kept): the membership test below
# treats the list as a set, so the denominator must count distinct genes too.
target_genes = (
    pd.read_csv(GENE_ORDER_PATH, sep='\t', header=None)[0]
    .drop_duplicates()
    .values
)
print(f"目标基因列表长度: {len(target_genes)}")


# 2. Mark the adata columns whose gene symbol appears in the target list.
# Assumes adata.var['gene_symbols'] holds the gene names; if adata.var_names
# holds them instead, use adata.var_names here.
available_mask = adata.var['gene_symbols'].isin(target_genes)

# 3. Keep only the matched columns.
# NOTE(review): AnnData documents this kind of indexing as returning a view;
# the sub-matrix is still materialised when .X is read below.
adata_subset = adata[:, available_mask]

# 4. Sparsity after alignment.
# Target genes absent from adata are all zeros, so every non-zero entry of the
# aligned matrix comes from the matched columns. Several adata columns may carry
# the same gene symbol, so columns are collapsed per symbol before counting.
# NOTE(review): a target gene is treated as non-zero for a cell when ANY column
# with that symbol is non-zero — confirm this matches the real alignment step.
matched_symbols = (
    adata.var['gene_symbols'].to_numpy()[available_mask.to_numpy()].astype(str)
)
unique_symbols, symbol_codes = np.unique(matched_symbols, return_inverse=True)

# Indicator matrix mapping each matched column to its distinct gene symbol.
n_matched_columns = matched_symbols.size
column_to_symbol = scipy.sparse.csr_matrix(
    (
        np.ones(n_matched_columns, dtype=np.int32),
        (np.arange(n_matched_columns), symbol_codes),
    ),
    shape=(n_matched_columns, unique_symbols.size),
)

# `!= 0` works for sparse and dense X alike and ignores explicitly stored zeros
# (getnnz() on the raw matrix would count them and fails on a dense array).
detected = scipy.sparse.csr_matrix(adata_subset.X != 0).astype(np.int32)
total_non_zeros = (detected @ column_to_symbol).getnnz()

# Total elements of the aligned matrix = cells x distinct target genes.
# This uses len(target_genes), not adata_subset.n_vars, because missing target
# genes still occupy (all-zero) columns after alignment.
total_elements_aligned = adata.n_obs * len(target_genes)
if total_elements_aligned == 0:
    raise ValueError(
        "Cannot compute aligned sparsity: adata has no cells or the target gene list is empty."
    )

# Sparsity = fraction of zero entries; scale by the gene count for a per-cell average.
sparsity_aligned = 1.0 - (total_non_zeros / total_elements_aligned)
avg_zeros_aligned = sparsity_aligned * len(target_genes)

print(f"--- 对齐统计 ---")
print(f"共有基因数: {unique_symbols.size} (在 {len(target_genes)} 个目标基因中)")
print(f"对齐后的稀疏度: {sparsity_aligned:.2%}")
print(f"平均每个细胞的 0 值基因数 (对齐后): {avg_zeros_aligned:.1f}")

FileNotFoundError: [Errno 2] No such file or directory: 'data/assets/gene_order.tsv'