In [1]:
import scvelo as scv
import pandas as pd

# Filtration

## CTL

In [2]:
# Load the raw CTL loom file into an AnnData object (`adata`).
# cache=False is passed through to scv.read (presumably skips the on-disk
# cache -- confirm against the scvelo/scanpy read documentation).
adata = scv.read("../01_Input_file/CTL_raw.loom", cache=False)

Variable names are not unique. To make them unique, call `.var_names_make_unique`.


In [3]:
# Show the AnnData summary: dimensions, `var` columns and available layers.
adata

AnnData object with n_obs × n_vars = 10572 × 32285
    var: 'Accession', 'Chromosome', 'End', 'Start', 'Strand'
    layers: 'matrix', 'ambiguous', 'spliced', 'unspliced'

In [4]:
# The loader warned that variable names are not unique; follow its advice and
# de-duplicate them on `adata` before any further processing.
adata.var_names_make_unique()   # fix duplicate gene names

In [8]:
# Inspect the per-cell table. Its index holds IDs of the form
# `possorted_genome_bam_F2ZHW:<barcode>x`, which the metadata must be mapped to.
adata.obs

possorted_genome_bam_F2ZHW:AAACGCTAGCGTATGGx
possorted_genome_bam_F2ZHW:AAAGTGAAGACATACAx
possorted_genome_bam_F2ZHW:AAATGGACATTGCCGGx
possorted_genome_bam_F2ZHW:AAACGAAGTGATTCTGx
possorted_genome_bam_F2ZHW:AAACGAAGTCATCACAx
...
possorted_genome_bam_F2ZHW:TTTCCTCAGCAAACATx
possorted_genome_bam_F2ZHW:TTTCGATGTGTGTTTGx
possorted_genome_bam_F2ZHW:TTTGGAGCAGAGTAATx
possorted_genome_bam_F2ZHW:TTTCATGGTGCCTAATx
possorted_genome_bam_F2ZHW:TTTCGATAGGCAGGGAx


In [5]:
# Read the per-cell annotation table from meta_ctl.csv; the displayed columns
# are CellID and cell_type.
cell_meta = pd.read_csv("meta_ctl.csv")
cell_meta

Unnamed: 0,CellID,cell_type
0,AAACGAATCAAGTGGG-1_1,LC_ER-_LuP
1,AAAGAACTCTAAACGC-1_1,Myo
2,AAAGGGCTCTGTCAGA-1_1,LC_ER-_LuP
3,AAAGTCCAGTGCTCGC-1_1,LC_ER+_Foxa1
4,AAAGTCCCACTGGACC-1_1,LC_ER-_LuP
...,...,...
1347,TTTGGAGTCTCTCAAT-1_1,LC_ER-_LuP
1348,TTTGGTTCACACCGCA-1_1,Myo
1349,TTTGGTTGTCACGACC-1_1,LC_ER-_LuP
1350,TTTGGTTGTGTACGCC-1_1,LC_ER-_LuP


In [6]:
# Read the UMAP coordinates from umap_ctl.csv (columns CellID, umap_1, umap_2).
# sep=',' is the pandas default; it is only spelled out explicitly here.
cell_umap = pd.read_csv("umap_ctl.csv", sep=',')
cell_umap

Unnamed: 0,CellID,umap_1,umap_2
0,AAACGAATCAAGTGGG-1_1,7.519389,6.254953
1,AAAGAACTCTAAACGC-1_1,-7.574637,-7.710628
2,AAAGGGCTCTGTCAGA-1_1,2.844351,3.001735
3,AAAGTCCAGTGCTCGC-1_1,1.854609,-7.134219
4,AAAGTCCCACTGGACC-1_1,7.263851,6.731787
...,...,...,...
1347,TTTGGAGTCTCTCAAT-1_1,2.397697,3.261179
1348,TTTGGTTCACACCGCA-1_1,-8.270704,-8.207389
1349,TTTGGTTGTCACGACC-1_1,2.562882,3.069721
1350,TTTGGTTGTGTACGCC-1_1,6.640626,7.261545


In [7]:
# Combine annotations and UMAP coordinates into one row per CellID.
# how="inner" keeps only CellIDs present in both tables.
# validate="one_to_one" makes pandas raise MergeError if a CellID is duplicated
# in either table, instead of silently multiplying rows and breaking the later
# join onto adata_sub.obs.
cell_meta_umap = pd.merge(
    cell_meta,
    cell_umap,
    on="CellID",
    how="inner",
    validate="one_to_one",
)
cell_meta_umap

Unnamed: 0,CellID,cell_type,umap_1,umap_2
0,AAACGAATCAAGTGGG-1_1,LC_ER-_LuP,7.519389,6.254953
1,AAAGAACTCTAAACGC-1_1,Myo,-7.574637,-7.710628
2,AAAGGGCTCTGTCAGA-1_1,LC_ER-_LuP,2.844351,3.001735
3,AAAGTCCAGTGCTCGC-1_1,LC_ER+_Foxa1,1.854609,-7.134219
4,AAAGTCCCACTGGACC-1_1,LC_ER-_LuP,7.263851,6.731787
...,...,...,...,...
1347,TTTGGAGTCTCTCAAT-1_1,LC_ER-_LuP,2.397697,3.261179
1348,TTTGGTTCACACCGCA-1_1,Myo,-8.270704,-8.207389
1349,TTTGGTTGTCACGACC-1_1,LC_ER-_LuP,2.562882,3.069721
1350,TTTGGTTGTGTACGCC-1_1,LC_ER-_LuP,6.640626,7.261545


In [11]:
# Use CellID as the row index and remove it from the columns.
# The guard keeps this cell re-runnable: once CellID has been moved into the
# index, executing the cell again is a no-op instead of raising KeyError.
if "CellID" in cell_meta_umap.columns:
    cell_meta_umap = cell_meta_umap.set_index("CellID")

cell_meta_umap

Unnamed: 0_level_0,cell_type,umap_1,umap_2
CellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAACGAATCAAGTGGG-1_1,LC_ER-_LuP,7.519389,6.254953
AAAGAACTCTAAACGC-1_1,Myo,-7.574637,-7.710628
AAAGGGCTCTGTCAGA-1_1,LC_ER-_LuP,2.844351,3.001735
AAAGTCCAGTGCTCGC-1_1,LC_ER+_Foxa1,1.854609,-7.134219
AAAGTCCCACTGGACC-1_1,LC_ER-_LuP,7.263851,6.731787
...,...,...,...
TTTGGAGTCTCTCAAT-1_1,LC_ER-_LuP,2.397697,3.261179
TTTGGTTCACACCGCA-1_1,Myo,-8.270704,-8.207389
TTTGGTTGTCACGACC-1_1,LC_ER-_LuP,2.562882,3.069721
TTTGGTTGTGTACGCC-1_1,LC_ER-_LuP,6.640626,7.261545


In [13]:
# Map metadata CellIDs onto the obs_names convention used by the loom file.

def add_scvelo_ID(df, prefix="possorted_genome_bam_F2ZHW", suffix="x", barcode_len=16):
    """Return a copy of ``df`` with a ``scvelo_cellID`` column built from its index.

    Each index label is converted to ``str``, cut at the first ``-``, truncated
    to ``barcode_len`` characters and wrapped as ``f"{prefix}:{barcode}{suffix}"``.
    With the defaults, ``AAACGAATCAAGTGGG-1_1`` becomes
    ``possorted_genome_bam_F2ZHW:AAACGAATCAAGTGGGx``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds the cell identifiers. It is not modified.
    prefix : str, optional
        Text placed before the ``:`` separator. The default reproduces the
        prefix seen in this notebook's ``adata.obs_names``.
    suffix : str, optional
        Text appended after the barcode (default ``"x"``).
    barcode_len : int, optional
        Maximum number of leading characters kept from the part of the label
        before the first ``-`` (default 16).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the additional ``scvelo_cellID`` column.

    Notes
    -----
    Everything after the first ``-`` is discarded, so labels that differ only
    in that part (e.g. ``...-1_1`` vs ``...-1_2``) map to the same ID.
    """
    out = df.copy()
    # split("-", 1)[0] yields the text before the first dash (the whole label
    # when there is no dash), exactly like split("-")[0].
    barcodes = [
        label.split("-", 1)[0][:barcode_len]
        for label in out.index.astype(str)
    ]
    out["scvelo_cellID"] = [f"{prefix}:{barcode}{suffix}" for barcode in barcodes]
    return out

In [20]:
# Build a copy of the metadata with an extra `scvelo_cellID` column holding the
# IDs in the same format as `adata.obs_names`.
new_cell_meta_umap = add_scvelo_ID(cell_meta_umap)

In [21]:
# Display the metadata to check the generated `scvelo_cellID` values.
new_cell_meta_umap

Unnamed: 0_level_0,cell_type,umap_1,umap_2,scvelo_cellID
CellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AAACGAATCAAGTGGG-1_1,LC_ER-_LuP,7.519389,6.254953,possorted_genome_bam_F2ZHW:AAACGAATCAAGTGGGx
AAAGAACTCTAAACGC-1_1,Myo,-7.574637,-7.710628,possorted_genome_bam_F2ZHW:AAAGAACTCTAAACGCx
AAAGGGCTCTGTCAGA-1_1,LC_ER-_LuP,2.844351,3.001735,possorted_genome_bam_F2ZHW:AAAGGGCTCTGTCAGAx
AAAGTCCAGTGCTCGC-1_1,LC_ER+_Foxa1,1.854609,-7.134219,possorted_genome_bam_F2ZHW:AAAGTCCAGTGCTCGCx
AAAGTCCCACTGGACC-1_1,LC_ER-_LuP,7.263851,6.731787,possorted_genome_bam_F2ZHW:AAAGTCCCACTGGACCx
...,...,...,...,...
TTTGGAGTCTCTCAAT-1_1,LC_ER-_LuP,2.397697,3.261179,possorted_genome_bam_F2ZHW:TTTGGAGTCTCTCAATx
TTTGGTTCACACCGCA-1_1,Myo,-8.270704,-8.207389,possorted_genome_bam_F2ZHW:TTTGGTTCACACCGCAx
TTTGGTTGTCACGACC-1_1,LC_ER-_LuP,2.562882,3.069721,possorted_genome_bam_F2ZHW:TTTGGTTGTCACGACCx
TTTGGTTGTGTACGCC-1_1,LC_ER-_LuP,6.640626,7.261545,possorted_genome_bam_F2ZHW:TTTGGTTGTGTACGCCx


In [18]:
# Print the first five obs_names to compare their format with the generated
# `scvelo_cellID` values.
print(adata.obs_names[:5])

Index(['possorted_genome_bam_F2ZHW:AAACGCTAGCGTATGGx',
       'possorted_genome_bam_F2ZHW:AAAGTGAAGACATACAx',
       'possorted_genome_bam_F2ZHW:AAATGGACATTGCCGGx',
       'possorted_genome_bam_F2ZHW:AAACGAAGTGATTCTGx',
       'possorted_genome_bam_F2ZHW:AAACGAAGTCATCACAx'],
      dtype='object', name='CellID')


In [22]:
# Keep only the cells of `adata` whose obs_name appears among the
# scvelo-style IDs derived from the metadata table.
valid_cells = new_cell_meta_umap["scvelo_cellID"].values
in_metadata = adata.obs_names.isin(valid_cells)   # boolean mask over all cells
# .copy() materialises the subset as its own AnnData object.
adata_sub = adata[in_metadata].copy()

In [23]:
# Report how many cells were present before and after the metadata filter.
print(f"Original cells: {adata.n_obs}, Filtered cells: {adata_sub.n_obs}")

Original cells: 10572, Filtered cells: 1352


In [24]:
# Re-index the metadata by the scvelo-style ID so that it lines up with
# `adata_sub.obs_names`, removing the ID from the columns.
# The guard keeps this cell re-runnable: after the column has been moved into
# the index, a second execution is a no-op instead of raising KeyError.
if "scvelo_cellID" in new_cell_meta_umap.columns:
    new_cell_meta_umap = new_cell_meta_umap.set_index("scvelo_cellID")

In [25]:
# Display the metadata, now indexed by the scvelo-style cell ID.
new_cell_meta_umap

Unnamed: 0_level_0,cell_type,umap_1,umap_2
scvelo_cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
possorted_genome_bam_F2ZHW:AAACGAATCAAGTGGGx,LC_ER-_LuP,7.519389,6.254953
possorted_genome_bam_F2ZHW:AAAGAACTCTAAACGCx,Myo,-7.574637,-7.710628
possorted_genome_bam_F2ZHW:AAAGGGCTCTGTCAGAx,LC_ER-_LuP,2.844351,3.001735
possorted_genome_bam_F2ZHW:AAAGTCCAGTGCTCGCx,LC_ER+_Foxa1,1.854609,-7.134219
possorted_genome_bam_F2ZHW:AAAGTCCCACTGGACCx,LC_ER-_LuP,7.263851,6.731787
...,...,...,...
possorted_genome_bam_F2ZHW:TTTGGAGTCTCTCAATx,LC_ER-_LuP,2.397697,3.261179
possorted_genome_bam_F2ZHW:TTTGGTTCACACCGCAx,Myo,-8.270704,-8.207389
possorted_genome_bam_F2ZHW:TTTGGTTGTCACGACCx,LC_ER-_LuP,2.562882,3.069721
possorted_genome_bam_F2ZHW:TTTGGTTGTGTACGCCx,LC_ER-_LuP,6.640626,7.261545


In [26]:
# Attach cell_type / UMAP columns to the filtered object's obs table, aligned
# on the cell ID index.
# A duplicated metadata ID would make the left join emit extra rows, which no
# longer match the number of cells; fail early with a clear message instead.
assert new_cell_meta_umap.index.is_unique, (
    "metadata index contains duplicate cell IDs; cannot align with adata_sub.obs"
)
# Drop columns attached by a previous run of this cell first, so that
# re-running does not raise 'columns overlap but no suffix specified'.
adata_sub.obs = (
    adata_sub.obs
    .drop(columns=new_cell_meta_umap.columns, errors="ignore")
    .join(new_cell_meta_umap, how="left")
)

In [27]:
# Print the first rows of the annotated obs table to check that the cell_type
# and umap columns were attached.
print(adata_sub.obs.head())

                                                 cell_type    umap_1    umap_2
CellID                                                                        
possorted_genome_bam_F2ZHW:AAAGGGCTCTGTCAGAx    LC_ER-_LuP  2.844351  3.001735
possorted_genome_bam_F2ZHW:AAAGTCCAGTGCTCGCx  LC_ER+_Foxa1  1.854609 -7.134219
possorted_genome_bam_F2ZHW:AAAGAACTCTAAACGCx           Myo -7.574637 -7.710628
possorted_genome_bam_F2ZHW:AAACGAATCAAGTGGGx    LC_ER-_LuP  7.519389  6.254953
possorted_genome_bam_F2ZHW:AAAGTCCGTACGAGTGx    LC_ER-_LuP  8.172050  6.475503


In [28]:
# Save the filtered, annotated object as a loom file. The path is relative, so
# the file is written to the current working directory.
adata_sub.write_loom("filtered_CTL.loom")