In [1]:
import scvelo as scv
import pandas as pd

In [2]:
# Load the raw loom file; cache=False reads from the source file rather than a cached copy.
adata = scv.read("../01_Input_file/ERPik_raw.loom", cache=False)

Variable names are not unique. To make them unique, call `.var_names_make_unique`.


In [3]:
# The loader warned that variable (gene) names are not unique; de-duplicate them as the warning suggests.
adata.var_names_make_unique()

In [4]:
# Show the per-cell table to check how cell IDs are formatted in the loom file.
adata.obs

possorted_genome_bam_W83LZ:AAAGGATTCAGCCTTCx
possorted_genome_bam_W83LZ:AAACGAACAGCTGTCGx
possorted_genome_bam_W83LZ:AAAGAACCATCTTTCAx
possorted_genome_bam_W83LZ:AAACGAAGTCATATGCx
possorted_genome_bam_W83LZ:AAACCCACATCCCACTx
...
possorted_genome_bam_W83LZ:TTTGGTTGTCGCACGTx
possorted_genome_bam_W83LZ:TTTGTTGAGCCTCAGCx
possorted_genome_bam_W83LZ:TTTGGTTCATCGATACx
possorted_genome_bam_W83LZ:TTTGTTGAGCCATGCCx
possorted_genome_bam_W83LZ:TTTGGAGGTTGGGTTTx


In [6]:
# Read the per-cell annotation table and the UMAP coordinate table, then keep
# only the cells whose CellID is present in both (inner merge).
cell_meta = pd.read_csv("meta_ER_Pik.csv")
cell_umap = pd.read_csv("umap_ER_Pik.csv", sep=",")
cell_meta_umap = cell_meta.merge(cell_umap, on="CellID", how="inner")

cell_meta_umap

Unnamed: 0,CellID,cell_type,umap_1,umap_2
0,AAACCCAAGGTTACAA-1_3,Myo,-7.883332,-8.220748
1,AAACCCACACCCAAGC-1_3,Late_HY,-7.431042,0.378131
2,AAACCCACACCTGCGA-1_3,Late_HY,-6.337716,2.035150
3,AAACCCACATCCCACT-1_3,Late_HY,-6.760766,1.335699
4,AAACGAAAGCGGCTCT-1_3,Late_HY,-6.763875,2.675161
...,...,...,...,...
8664,TTTGTTGCATCGAAGG-1_3,LC_ER+_Sca1,-1.967123,-7.662631
8665,TTTGTTGGTTAGAGTA-1_3,Late_HY,-5.430854,1.680624
8666,TTTGTTGTCCTTCTGG-1_3,Late_HY,-7.170263,3.242602
8667,TTTGTTGTCGCAAGAG-1_3,LC_ER+_Foxa1,1.734159,-7.383719


In [7]:
# Index the merged table by CellID so rows are addressed by cell barcode.
# The guard keeps this cell safe to re-run: once CellID is already the index,
# the column no longer exists and indexing by it again would raise a KeyError.
if cell_meta_umap.index.name != "CellID":
    cell_meta_umap = cell_meta_umap.set_index("CellID")

cell_meta_umap

Unnamed: 0_level_0,cell_type,umap_1,umap_2
CellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAACCCAAGGTTACAA-1_3,Myo,-7.883332,-8.220748
AAACCCACACCCAAGC-1_3,Late_HY,-7.431042,0.378131
AAACCCACACCTGCGA-1_3,Late_HY,-6.337716,2.035150
AAACCCACATCCCACT-1_3,Late_HY,-6.760766,1.335699
AAACGAAAGCGGCTCT-1_3,Late_HY,-6.763875,2.675161
...,...,...,...
TTTGTTGCATCGAAGG-1_3,LC_ER+_Sca1,-1.967123,-7.662631
TTTGTTGGTTAGAGTA-1_3,Late_HY,-5.430854,1.680624
TTTGTTGTCCTTCTGG-1_3,Late_HY,-7.170263,3.242602
TTTGTTGTCGCAAGAG-1_3,LC_ER+_Foxa1,1.734159,-7.383719


In [8]:
# Build loom-style cell IDs (prefix:BARCODEx) from the metadata CellIDs

def add_scvelo_ID(df, prefix="possorted_genome_bam_W83LZ", suffix="x", barcode_len=16):
    """Return a copy of ``df`` with a ``scvelo_cellID`` column built from its index.

    Each index label is converted to a string, everything from the first
    ``"-"`` onward is discarded, the remainder is truncated to ``barcode_len``
    characters, and the result is formatted as ``"{prefix}:{barcode}{suffix}"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds the cell IDs (e.g. ``"AAACCCAAGGTTACAA-1_3"``).
    prefix : str, optional
        Text placed before the ``":"``. Defaults to the sample prefix used by
        the loom file in this notebook.
    suffix : str, optional
        Text appended after the barcode. Defaults to ``"x"``.
    barcode_len : int, optional
        Maximum number of leading characters kept from the barcode. Defaults to 16.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the extra ``scvelo_cellID`` column; ``df`` itself
        is not modified.

    Notes
    -----
    Index labels that differ only after the first ``"-"`` (or beyond
    ``barcode_len`` characters) map to the same ID; no de-duplication is done.
    """
    out = df.copy()
    # Keep only the part before the first "-" and cap it at barcode_len characters.
    barcodes = (label.split("-")[0][:barcode_len] for label in out.index.astype(str))
    out["scvelo_cellID"] = [f"{prefix}:{barcode}{suffix}" for barcode in barcodes]
    return out

In [9]:
# Add the loom-style ID column to the metadata table and display the result.
new_cell_meta_umap = add_scvelo_ID(cell_meta_umap)
new_cell_meta_umap

Unnamed: 0_level_0,cell_type,umap_1,umap_2,scvelo_cellID
CellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AAACCCAAGGTTACAA-1_3,Myo,-7.883332,-8.220748,possorted_genome_bam_W83LZ:AAACCCAAGGTTACAAx
AAACCCACACCCAAGC-1_3,Late_HY,-7.431042,0.378131,possorted_genome_bam_W83LZ:AAACCCACACCCAAGCx
AAACCCACACCTGCGA-1_3,Late_HY,-6.337716,2.035150,possorted_genome_bam_W83LZ:AAACCCACACCTGCGAx
AAACCCACATCCCACT-1_3,Late_HY,-6.760766,1.335699,possorted_genome_bam_W83LZ:AAACCCACATCCCACTx
AAACGAAAGCGGCTCT-1_3,Late_HY,-6.763875,2.675161,possorted_genome_bam_W83LZ:AAACGAAAGCGGCTCTx
...,...,...,...,...
TTTGTTGCATCGAAGG-1_3,LC_ER+_Sca1,-1.967123,-7.662631,possorted_genome_bam_W83LZ:TTTGTTGCATCGAAGGx
TTTGTTGGTTAGAGTA-1_3,Late_HY,-5.430854,1.680624,possorted_genome_bam_W83LZ:TTTGTTGGTTAGAGTAx
TTTGTTGTCCTTCTGG-1_3,Late_HY,-7.170263,3.242602,possorted_genome_bam_W83LZ:TTTGTTGTCCTTCTGGx
TTTGTTGTCGCAAGAG-1_3,LC_ER+_Foxa1,1.734159,-7.383719,possorted_genome_bam_W83LZ:TTTGTTGTCGCAAGAGx


In [10]:
# Keep only the loom cells whose ID appears in the annotated metadata table.
valid_cells = new_cell_meta_umap["scvelo_cellID"].values
is_annotated = adata.obs_names.isin(valid_cells)
adata_sub = adata[is_annotated].copy()  # copy so adata_sub is independent of adata

In [11]:
# Report how many cells remain after restricting to the annotated cell IDs.
print(f"Original cells: {adata.n_obs}, Filtered cells: {adata_sub.n_obs}")

Original cells: 20872, Filtered cells: 8669


In [12]:
# Re-index the metadata by the loom-style cell ID so it can be joined onto
# adata_sub.obs by index. The guard keeps this cell safe to re-run: once the
# ID is already the index, the column is gone and indexing by it again would
# raise a KeyError.
if new_cell_meta_umap.index.name != "scvelo_cellID":
    new_cell_meta_umap = new_cell_meta_umap.set_index("scvelo_cellID")

In [13]:
# Display the metadata table, now indexed by the loom-style cell ID.
new_cell_meta_umap

Unnamed: 0_level_0,cell_type,umap_1,umap_2
scvelo_cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
possorted_genome_bam_W83LZ:AAACCCAAGGTTACAAx,Myo,-7.883332,-8.220748
possorted_genome_bam_W83LZ:AAACCCACACCCAAGCx,Late_HY,-7.431042,0.378131
possorted_genome_bam_W83LZ:AAACCCACACCTGCGAx,Late_HY,-6.337716,2.035150
possorted_genome_bam_W83LZ:AAACCCACATCCCACTx,Late_HY,-6.760766,1.335699
possorted_genome_bam_W83LZ:AAACGAAAGCGGCTCTx,Late_HY,-6.763875,2.675161
...,...,...,...
possorted_genome_bam_W83LZ:TTTGTTGCATCGAAGGx,LC_ER+_Sca1,-1.967123,-7.662631
possorted_genome_bam_W83LZ:TTTGTTGGTTAGAGTAx,Late_HY,-5.430854,1.680624
possorted_genome_bam_W83LZ:TTTGTTGTCCTTCTGGx,Late_HY,-7.170263,3.242602
possorted_genome_bam_W83LZ:TTTGTTGTCGCAAGAGx,LC_ER+_Foxa1,1.734159,-7.383719


In [14]:
# Attach the metadata columns (cell_type, umap_1, umap_2) to the filtered cells by index.
# how="left" keeps every row of adata_sub.obs; rows are matched on the loom-style cell ID.
# NOTE(review): re-running this cell after it has succeeded will likely fail because the
# joined columns would then overlap — restart the kernel before re-running; confirm.
adata_sub.obs = adata_sub.obs.join(new_cell_meta_umap, how="left")

In [15]:
# Preview the first rows to confirm the annotations were attached.
print(adata_sub.obs.head())

                                                cell_type    umap_1    umap_2
CellID                                                                       
possorted_genome_bam_W83LZ:AAAGGATTCAGCCTTCx      Late_HY -7.179417  3.192922
possorted_genome_bam_W83LZ:AAAGAACCATCTTTCAx  LC_ER+_Sca1 -0.580667 -8.247224
possorted_genome_bam_W83LZ:AAACGAAGTCATATGCx     Early_HY -1.798325 -4.692605
possorted_genome_bam_W83LZ:AAACCCACATCCCACTx      Late_HY -6.760766  1.335699
possorted_genome_bam_W83LZ:AAAGGATTCAGCGCACx  LC_ER+_Sca1 -0.561108 -6.228294


In [16]:
# Save the filtered, annotated object as a loom file in the current working directory.
adata_sub.write_loom("filtered_ER_Pik.loom")