In [1]:
# Turn off Jedi-backed completion in IPython's completer for this session
%config Completer.use_jedi = False
import sys, IPython
# Report the IPython version the kernel is running
print("IPython version:", IPython.__version__)
import jedi
# Report the installed Jedi version
print("Jedi version:", jedi.__version__)

IPython version: 8.37.0
Jedi version: 0.19.2


In [2]:
# -- 数据处理 --
import scanpy as sc
import pandas as pd
import anndata as ad
import numpy as np
import scipy
import matplotlib
import seaborn as sns

# -- Pytorch --
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

# -- 辅助库 --
import os
import math
import random
from tqdm.auto import tqdm
import warnings

# Silence noisy warnings: UserWarnings attributed to modules whose name
# starts with 'scanpy', and every FutureWarning regardless of origin.
warnings.filterwarnings(action='ignore', category=UserWarning, module='scanpy')
warnings.simplefilter('ignore', FutureWarning)

# Print the library versions in use
print(f"PyTorch 版本: {torch.__version__}")
print(f"Scanpy 版本: {sc.__version__}")

PyTorch 版本: 2.1.2+cu121
Scanpy 版本: 1.9.3


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Other input files, currently disabled:
# DATA2_PATH = "/gpfs/hybrid/data/public/TEDD/link_cells/hs_AD_Brain_Cerebellum[Organoid]_36-a.link.h5ad"
# DATA1_PATH = "/gpfs/hybrid/data/public/PerturBase/drug_perturb.true_time/test/2.preprocess/GSE134839.Erlotinib.pp.h5ad"

# Active input: the .h5ad file that the loading cell reads via DATA1_PATH.
# NOTE(review): absolute path — it only resolves on a machine where /gpfs/hybrid is mounted.
DATA1_PATH = "/gpfs/hybrid/data/public/TEDD/link_cells/Tedd.19_Embryo.link.h5ad"

In [None]:




# Load the .h5ad file at DATA1_PATH into an AnnData object
adata = sc.read_h5ad(filename=DATA1_PATH)

# Display the object's summary as the cell output
adata

AnnData object with n_obs × n_vars = 590238 × 3000
    obs: 'Tissue', 'Celltype', 'Timepoint', 'Sex', 'UMAP_1', 'UMAP_2', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'outlier', 'time', 'prev_cell_id', 'next_cell_id'
    var: 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'Cell', 'PMID_DOI', 'Project', 'Sequencing', 'Species', 'Stage', 'Technology', 'Timepoint', 'Tissue', 'conversion_rule', 'hvg', 'log1p', 'needs_conversion', 'needs_reference_unification', 'neighbors', 'special_notes', 'time_colors', 'time_unit', 'timepoints', 'umap'
    obsm: 'X_AE', 'X_umap'
    obsp: 'connectivities', 'distances'

In [7]:
# List the layers stored on the loaded AnnData object
adata.layers


Layers with keys: 

In [9]:
# Fetch the matrix stored under the 'log1p' layer.
# NOTE(review): the saved layer listing in this notebook shows no keys; if the
# currently loaded file has no 'log1p' layer this lookup raises KeyError —
# confirm which dataset this cell is meant to run against.
log1p_data = adata.layers['log1p']

# Only the last expression of a cell is displayed, so a bare `type(...)` here
# would be discarded silently; print it so the container type is actually shown.
print(type(log1p_data))

# Preview the first 5 rows and first 20 columns
log1p_data[:5, :20]

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 1.9209595 , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 1.3766967 ,
        0.68675894, 0.        , 0.        , 0

In [36]:
# Extremes of the main matrix adata.X; the cell displays them as (max, min)
min_val, max_val = adata.X.min(), adata.X.max()

(max_val, min_val)

(434.82715, -1.7225165)

In [32]:
# Check the element dtype of the main matrix adata.X
adata.X.dtype


dtype('float32')

In [8]:
# Read the 'time_unit' entry from the dataset's unstructured metadata (adata.uns)
unit = adata.uns['time_unit']
# Display the value as the cell output
unit

'embryonic_days'

In [9]:
# Read the 'timepoints' entry from adata.uns.
# NOTE(review): the saved output is a single comma-separated string in
# lexicographic order ('9.5' comes last), not a list of numbers — presumably it
# has to be split and cast to float before any numeric use; confirm.
timepoints = adata.uns['timepoints']
# Display the value as the cell output
timepoints

'10.5,11.5,12.5,13.5,9.5'

In [15]:
# Show the first row of adata.obs, selected by position.
# NOTE(review): the saved output lists 'UMAP1'/'UMAP2' and a '30y' timepoint,
# while the AnnData summary in this notebook lists 'UMAP_1'/'UMAP_2' and the
# time unit is 'embryonic_days' — the output looks stale; re-run to confirm.
adata.obs.iloc[0]



Tissue                          cerebral cortex
Celltype                      radial glial cell
Timepoint                                   30y
Sex                                        Male
UMAP1                                 -0.564602
UMAP2                                 14.525732
n_genes_by_counts                          1221
log1p_n_genes_by_counts                7.108244
total_counts                             2707.0
log1p_total_counts                     7.903965
pct_counts_in_top_20_genes            15.145918
outlier                                   False
time                                       30.0
prev_cell_id                              16178
next_cell_id                              13887
Name: homosapiens_None_2020_10x3v2_bhaduriaparna_001_d10_1038_s41586_020_1962_0_5_YH10PWeek8_GTAACGTCAGTCGTGC, dtype: object

In [None]:

# target_cell_name = "homosapiens_None_2020_10x3v2_bhaduriaparna_001_d10_1038_s41586_020_1962_0_5_YH10PWeek8_GTAACGTCAGTCGTGC"

# # adata[行名, :] 会返回这一行的数据视图
# cell_data = adata[target_cell_name, :].X.flatten()

# gene_expression = pd.Series(cell_data, index=adata.var_names)

# top5 = gene_expression.sort_values(ascending=False).head(5)

# print(f"细胞 {target_cell_name} \n表达量最高的 5 个基因是：")
# print("-" * 30)
# print(top5)

#------------------------------

# Expression values of the first cell (row 0 of adata.X); .flatten() makes
# sure the result is a 1-D array that pandas can take directly.
cell_data = adata.X[0, :].flatten()

# Label every value with its gene name
gene_expression = pd.Series(data=cell_data, index=adata.var_names)

# Sort in descending order and keep the first five entries
top5_genes = gene_expression.sort_values(ascending=False).iloc[:5]

# Print the heading followed by the result
print("该细胞表达量最高的 5 个基因是：", top5_genes, sep="\n")

该细胞表达量最高的 5 个基因是：
CP            26.079483
SRGAP3-AS4    23.649893
WDR93         16.228954
ACTA2         12.490084
HCG17         11.473103
dtype: float32


In [20]:
# Row 13887 of adata.obs, selected by position.
# NOTE(review): 13887 equals the next_cell_id shown for row 0 in the saved
# output earlier in this notebook; presumably this cell follows that link —
# confirm that next_cell_id is a positional row index.
cell_info = adata.obs.iloc[13887]

# Display the row as the cell output
cell_info

Tissue                                                brain
Celltype                      neuroblast (sensu Vertebrata)
Timepoint                                               36y
Sex                                                  Female
UMAP1                                             -1.789259
UMAP2                                             14.088922
n_genes_by_counts                                      2533
log1p_n_genes_by_counts                            7.837554
total_counts                                         5161.0
log1p_total_counts                                 8.549079
pct_counts_in_top_20_genes                        14.551444
outlier                                               False
time                                                   36.0
prev_cell_id                                          13217
next_cell_id                                          12996
Name: homosapiens_brain_2022_10x3v3_hezhisong_002_d10_1038_s41592_021_01344_8_34505_LTv2_NEd46_iNE46

In [26]:
# Expression values of the cell at row 13887, as a 1-D array labelled by gene name
cell_data_13887 = adata.X[13887, :].flatten()
gene_expression_13887 = pd.Series(data=cell_data_13887, index=adata.var_names)

# Five highest values: descending sort, then the first five entries
top5_genes = gene_expression_13887.sort_values(ascending=False).iloc[:5]

print(top5_genes)

TMED6        36.281353
LINC01118    29.933043
PITX2        19.045702
FEZF1        16.725554
GSX2         12.887493
dtype: float32


In [27]:
# Expression values of the cell at row 13887, as a 1-D array labelled by gene name
cell_data_13887 = adata.X[13887, :].flatten()
gene_expression_13887 = pd.Series(cell_data_13887, index=adata.var_names)

# top5_genes is not recomputed here: this cell never reads it, and the
# preceding cell already assigns the same value from the same inputs.

# The five genes printed as the top hits for row 0 earlier in this notebook
target_genes = ['CP', 'SRGAP3-AS4', 'WDR93', 'ACTA2', 'HCG17']

# Look those genes up in this cell's profile; .loc makes the label-based
# selection explicit.
print(gene_expression_13887.loc[target_genes])


CP           -0.055807
SRGAP3-AS4   -0.040038
WDR93        -0.060839
ACTA2        -0.117915
HCG17        -0.088672
dtype: float32
