In [2]:
import scvelo as scv
import pandas as pd

In [3]:
# Location of the raw loom file, relative to this notebook.
RAW_LOOM_PATH = "../01_Input_file/KitPik_raw.loom"

# Load the raw loom file; cache=False is passed so the read does not use scvelo's cache.
adata = scv.read(RAW_LOOM_PATH, cache=False)

Variable names are not unique. To make them unique, call `.var_names_make_unique`.


In [4]:
# Loading the loom file warned that variable names are not unique;
# de-duplicate them before any further processing.
adata.var_names_make_unique()   # fix duplicate gene names

In [5]:
# Inspect the per-cell annotation table; its index shows the cell-ID format
# that the external metadata has to be mapped onto further down.
adata.obs

possorted_genome_bam_Q7E7W:AAAGGATTCGTAACTGx
possorted_genome_bam_Q7E7W:AAACGAATCCAGCAATx
possorted_genome_bam_Q7E7W:AAACGCTTCGAACTCAx
possorted_genome_bam_Q7E7W:AAAGGGCCAAGCGGATx
possorted_genome_bam_Q7E7W:AAAGAACTCGAACCTAx
...
possorted_genome_bam_Q7E7W:TTTGTTGTCGTTGCCTx
possorted_genome_bam_Q7E7W:TTTGTTGGTTTCGCTCx
possorted_genome_bam_Q7E7W:TTTGTTGTCTCACTCGx
possorted_genome_bam_Q7E7W:TTTGGTTGTACTGTTGx
possorted_genome_bam_Q7E7W:TTTGGTTAGAACCCGAx


In [6]:
# Per-cell annotations and UMAP coordinates exported to CSV, both keyed by "CellID".
cell_meta = pd.read_csv("meta_Kit_Pik.csv")
cell_umap = pd.read_csv("umap_Kit_Pik.csv")  # comma is read_csv's default separator

# Inner merge: keep only cells that have both an annotation and UMAP coordinates.
cell_meta_umap = cell_meta.merge(cell_umap, how="inner", on="CellID")

cell_meta_umap

Unnamed: 0,CellID,cell_type,umap_1,umap_2
0,AAACCCAAGCCTAACT-1_2,LC_ER-_LuP,4.340409,3.598367
1,AAACCCAAGTAGGTTA-1_2,LC_ER-_LuP,4.656558,3.029929
2,AAACCCAAGTGCCGAA-1_2,LC_ER-_LaP,5.102678,7.060255
3,AAACCCACAGCAGATG-1_2,LC_ER-_LuP,5.902091,4.927938
4,AAACCCACATGGAATA-1_2,LC_ER-_LuP,3.184937,3.071098
...,...,...,...,...
6017,TTTGTTGGTTTCGCTC-1_2,LC_ER-_LuP,5.596863,4.484584
6018,TTTGTTGTCCCATAAG-1_2,LC_ER-_LuP,2.975631,2.102163
6019,TTTGTTGTCCCTTCCC-1_2,LC_ER-_LaP,2.599695,5.917958
6020,TTTGTTGTCGTTGCCT-1_2,LC_ER-_LuP,5.374636,2.943366


In [7]:
# Promote the CellID column to the row index; set_index also removes it from the columns.
cell_meta_umap = cell_meta_umap.set_index("CellID")

cell_meta_umap

Unnamed: 0_level_0,cell_type,umap_1,umap_2
CellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAACCCAAGCCTAACT-1_2,LC_ER-_LuP,4.340409,3.598367
AAACCCAAGTAGGTTA-1_2,LC_ER-_LuP,4.656558,3.029929
AAACCCAAGTGCCGAA-1_2,LC_ER-_LaP,5.102678,7.060255
AAACCCACAGCAGATG-1_2,LC_ER-_LuP,5.902091,4.927938
AAACCCACATGGAATA-1_2,LC_ER-_LuP,3.184937,3.071098
...,...,...,...
TTTGTTGGTTTCGCTC-1_2,LC_ER-_LuP,5.596863,4.484584
TTTGTTGTCCCATAAG-1_2,LC_ER-_LuP,2.975631,2.102163
TTTGTTGTCCCTTCCC-1_2,LC_ER-_LaP,2.599695,5.917958
TTTGTTGTCGTTGCCT-1_2,LC_ER-_LuP,5.374636,2.943366


In [8]:
# Build loom-style cell IDs from the metadata index

def add_scvelo_ID(df, prefix="possorted_genome_bam_Q7E7W:", suffix="x", barcode_len=16):
    """Return a copy of ``df`` with an added ``scvelo_cellID`` column.

    Each index label is converted to ``{prefix}{barcode}{suffix}``, where
    ``barcode`` is the text before the first ``-`` in the label, truncated to
    ``barcode_len`` characters. With the defaults, ``AAACCCAAGCCTAACT-1_2``
    becomes ``possorted_genome_bam_Q7E7W:AAACCCAAGCCTAACTx``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds the original cell identifiers. It is not
        modified; the function works on a copy.
    prefix : str, optional
        Text placed before the barcode. Defaults to the prefix used in this
        notebook's loom file.
    suffix : str, optional
        Text placed after the barcode (default ``"x"``).
    barcode_len : int, optional
        Maximum number of barcode characters to keep (default 16).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the extra ``scvelo_cellID`` column, rows in the
        original order.

    Warns
    -----
    UserWarning
        If two or more rows map to the same generated ID. Everything after
        the first ``-`` is discarded, so labels that differ only in that part
        collide, and a later index-based join on the ID would be ambiguous.
    """
    import warnings

    result = df.copy()
    labels = result.index.astype(str)

    # Keep only the barcode part: everything before the first "-", capped in length.
    barcodes = [label.split("-", 1)[0][:barcode_len] for label in labels]
    cell_ids = [f"{prefix}{barcode}{suffix}" for barcode in barcodes]
    result["scvelo_cellID"] = cell_ids

    n_duplicated = len(cell_ids) - len(set(cell_ids))
    if n_duplicated:
        warnings.warn(
            f"{n_duplicated} duplicate scvelo_cellID value(s) generated; "
            "index labels that differ only after the first '-' collide.",
            UserWarning,
            stacklevel=2,
        )
    return result

In [9]:
# Derive the loom-style cell ID for every metadata row. add_scvelo_ID works on
# a copy, so cell_meta_umap itself is left unchanged.
new_cell_meta_umap = add_scvelo_ID(cell_meta_umap)
new_cell_meta_umap

Unnamed: 0_level_0,cell_type,umap_1,umap_2,scvelo_cellID
CellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AAACCCAAGCCTAACT-1_2,LC_ER-_LuP,4.340409,3.598367,possorted_genome_bam_Q7E7W:AAACCCAAGCCTAACTx
AAACCCAAGTAGGTTA-1_2,LC_ER-_LuP,4.656558,3.029929,possorted_genome_bam_Q7E7W:AAACCCAAGTAGGTTAx
AAACCCAAGTGCCGAA-1_2,LC_ER-_LaP,5.102678,7.060255,possorted_genome_bam_Q7E7W:AAACCCAAGTGCCGAAx
AAACCCACAGCAGATG-1_2,LC_ER-_LuP,5.902091,4.927938,possorted_genome_bam_Q7E7W:AAACCCACAGCAGATGx
AAACCCACATGGAATA-1_2,LC_ER-_LuP,3.184937,3.071098,possorted_genome_bam_Q7E7W:AAACCCACATGGAATAx
...,...,...,...,...
TTTGTTGGTTTCGCTC-1_2,LC_ER-_LuP,5.596863,4.484584,possorted_genome_bam_Q7E7W:TTTGTTGGTTTCGCTCx
TTTGTTGTCCCATAAG-1_2,LC_ER-_LuP,2.975631,2.102163,possorted_genome_bam_Q7E7W:TTTGTTGTCCCATAAGx
TTTGTTGTCCCTTCCC-1_2,LC_ER-_LaP,2.599695,5.917958,possorted_genome_bam_Q7E7W:TTTGTTGTCCCTTCCCx
TTTGTTGTCGTTGCCT-1_2,LC_ER-_LuP,5.374636,2.943366,possorted_genome_bam_Q7E7W:TTTGTTGTCGTTGCCTx


In [10]:
# Restrict the raw data to cells that also appear in the annotated metadata.
valid_cells = new_cell_meta_umap["scvelo_cellID"].values
in_metadata = adata.obs_names.isin(valid_cells)  # boolean mask over adata's cells
adata_sub = adata[in_metadata].copy()            # copy() is applied to the subset, as before

In [11]:
# Sanity check: report how many cells were loaded and how many survive the metadata filter.
print(f"Original cells: {adata.n_obs}, Filtered cells: {adata_sub.n_obs}")

Original cells: 12929, Filtered cells: 6022


In [12]:
# Re-key the metadata by the loom-style cell ID so it can be joined onto adata_sub.obs;
# set_index also removes the scvelo_cellID column.
new_cell_meta_umap = new_cell_meta_umap.set_index("scvelo_cellID")

In [13]:
# Confirm the metadata is now indexed by the loom-style cell IDs.
new_cell_meta_umap

Unnamed: 0_level_0,cell_type,umap_1,umap_2
scvelo_cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
possorted_genome_bam_Q7E7W:AAACCCAAGCCTAACTx,LC_ER-_LuP,4.340409,3.598367
possorted_genome_bam_Q7E7W:AAACCCAAGTAGGTTAx,LC_ER-_LuP,4.656558,3.029929
possorted_genome_bam_Q7E7W:AAACCCAAGTGCCGAAx,LC_ER-_LaP,5.102678,7.060255
possorted_genome_bam_Q7E7W:AAACCCACAGCAGATGx,LC_ER-_LuP,5.902091,4.927938
possorted_genome_bam_Q7E7W:AAACCCACATGGAATAx,LC_ER-_LuP,3.184937,3.071098
...,...,...,...
possorted_genome_bam_Q7E7W:TTTGTTGGTTTCGCTCx,LC_ER-_LuP,5.596863,4.484584
possorted_genome_bam_Q7E7W:TTTGTTGTCCCATAAGx,LC_ER-_LuP,2.975631,2.102163
possorted_genome_bam_Q7E7W:TTTGTTGTCCCTTCCCx,LC_ER-_LaP,2.599695,5.917958
possorted_genome_bam_Q7E7W:TTTGTTGTCGTTGCCTx,LC_ER-_LuP,5.374636,2.943366


In [14]:
# Attach cell_type and the UMAP coordinates to the filtered cells by matching on the obs index.
# how="left" keeps every cell already in adata_sub.obs.
# NOTE(review): a duplicated scvelo_cellID in the metadata would duplicate rows in this join —
# confirm the metadata index is unique before relying on the result.
adata_sub.obs = adata_sub.obs.join(new_cell_meta_umap, how="left")

In [15]:
# Quick look at the first rows to confirm the annotations were attached.
print(adata_sub.obs.head())

                                               cell_type    umap_1    umap_2
CellID                                                                      
possorted_genome_bam_Q7E7W:AAACGAATCCAGCAATx  LC_ER-_LaP  2.019011  3.735029
possorted_genome_bam_Q7E7W:AAACGCTTCGAACTCAx  LC_ER-_LaP  5.438486  6.903892
possorted_genome_bam_Q7E7W:AAAGAACTCGAACCTAx  LC_ER-_LaP  4.851584  7.372660
possorted_genome_bam_Q7E7W:AAAGGATCACCCTTACx  LC_ER-_LuP  3.961037  3.916472
possorted_genome_bam_Q7E7W:AAACCCATCCAACCGGx  LC_ER-_LaP  2.804697  5.086157


In [16]:
# Persist the filtered, annotated dataset as a loom file (path is relative to the working directory).
adata_sub.write_loom("filtered_Kit_Pik.loom")