In [1]:
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
from collections import Counter
import scanpy.external as sce

In [2]:
%%time
# load each of the individual technology datasets
v2_adata = sc.read_h5ad("processed_10x_GEX_v2_adata.h5ad")
v3_adata = sc.read_h5ad("processed_10x_GEX_v3_adata.h5ad")
Multiome_adata = sc.read_h5ad("processed_Multiome_adata.h5ad")

CPU times: user 881 ms, sys: 1.81 s, total: 2.69 s
Wall time: 2.69 s


In [3]:
%%time
adata = sc.concat([v2_adata, v3_adata, Multiome_adata])

CPU times: user 629 ms, sys: 762 ms, total: 1.39 s
Wall time: 1.39 s


In [4]:
# Tally how many observations carry each donor_id label.
Counter(adata.obs["donor_id"])

Counter({'AH1-Nuclei_Multiome-v1': 16057,
         'H6-Nuclei_3prime-v3': 11888,
         'H3-Nuclei_3prime-v3': 10042,
         'H5-Nuclei_3prime-v3': 9230,
         'D2-Nuclei_3prime-v2': 7856,
         'D1-Nuclei_3prime-v2': 7562,
         'H4-Nuclei_3prime-v3': 7376,
         'H7-Nuclei_3prime-v3': 7110,
         'D3-Nuclei_3prime-v2': 6750,
         'D8-Nuclei_Multiome-v1': 6610,
         'D7-Nuclei_Multiome-v1': 6333,
         'H2-Nuclei_3prime-v3': 5966,
         'D5-Nuclei_3prime-v2': 5419,
         'D4-Nuclei_3prime-v2': 4418,
         'D7-Nuclei_3prime-v2': 2835,
         'D3-Nuclei_Multiome-v1': 2657,
         'D6-Nuclei_3prime-v2': 1618,
         'D11-Nuclei_3prime-v3': 343})

In [8]:
# Alphabetical list of the distinct donor_id labels.
sorted(adata.obs["donor_id"].unique())

['AH1-Nuclei_Multiome-v1',
 'D1-Nuclei_3prime-v2',
 'D11-Nuclei_3prime-v3',
 'D2-Nuclei_3prime-v2',
 'D3-Nuclei_3prime-v2',
 'D3-Nuclei_Multiome-v1',
 'D4-Nuclei_3prime-v2',
 'D5-Nuclei_3prime-v2',
 'D6-Nuclei_3prime-v2',
 'D7-Nuclei_3prime-v2',
 'D7-Nuclei_Multiome-v1',
 'D8-Nuclei_Multiome-v1',
 'H2-Nuclei_3prime-v3',
 'H3-Nuclei_3prime-v3',
 'H4-Nuclei_3prime-v3',
 'H5-Nuclei_3prime-v3',
 'H6-Nuclei_3prime-v3',
 'H7-Nuclei_3prime-v3']

In [9]:
# Number of distinct donor_id labels (dropna=False so the count matches len(unique())).
adata.obs["donor_id"].nunique(dropna=False)

18

There are 18 donor_id values but only 16 unique donors, because D3 and D7 were each profiled with both 3' v2 and Multiome.

In [12]:
# Tally observations per harmonized cell type label.
Counter(adata.obs["consistent_cell_type"])

Counter({'Cardiomyocyte': 65392,
         'Pericyte': 22231,
         'Fibroblast': 17309,
         'Endothelial': 8374,
         'Myeloid': 4288,
         'Lymphoid': 923,
         'Neuronal': 823,
         'Mast': 438,
         'Adipocyte': 207,
         'LEC': 73,
         'Epicardial': 12})

In [13]:
# Make sure X holds the raw counts.
# Assign a copy: without it, X and layers['counts'] would be the same matrix object,
# so any later in-place operation on X would also alter the stored raw counts.
adata.X = adata.layers['counts'].copy()

In [14]:
# Row sums of X (one total per observation), shown to inspect the values now stored in X.
adata.X.sum(axis=1)

matrix([[20554.],
        [18421.],
        [17166.],
        ...,
        [  497.],
        [  498.],
        [  492.]], dtype=float32)

### Examine the cell barcodes

In [20]:
# Preview the piece of every obs name that precedes its first "_".
adata.obs_names.str.split(pat="_").str[0]

Index(['HCAHeart7664652', 'HCAHeart7664652', 'HCAHeart7664652',
       'HCAHeart7664652', 'HCAHeart7664652', 'HCAHeart7664652',
       'HCAHeart7664652', 'HCAHeart7664652', 'HCAHeart7664652',
       'HCAHeart7664652',
       ...
       'HCAHeartST11064575', 'HCAHeartST11064575', 'HCAHeartST11064575',
       'HCAHeartST11064575', 'HCAHeartST11064575', 'HCAHeartST11064575',
       'HCAHeartST11064575', 'HCAHeartST11064575', 'HCAHeartST11064575',
       'HCAHeartST11064575'],
      dtype='object', name='barcode', length=120070)

In [31]:
# Cut each obs name at its final "_": the text before it becomes sample_id,
# the text after it becomes barcode.
adata.obs["sample_id"] = adata.obs_names.str.rsplit(pat="_", n=1).str[0]
adata.obs["barcode"] = adata.obs_names.str.rsplit(pat="_", n=1).str[-1]

In [32]:
# Number of distinct sample_id values (dropna=False so the count matches len(unique())).
adata.obs["sample_id"].nunique(dropna=False)

25

The sample_id is unique for each sample, while donor_id is NOT. This is because D1, D2, and AH1 each have duplicates (more than one sample_id under the same donor_id).

In [37]:
# Build a "<sample_id>:<barcode>" key for every observation.
adata.obs["sample_id_plus_barcode"] = (
    adata.obs["sample_id"].astype(str) + ":" + adata.obs["barcode"]
)

In [38]:
# Count occurrences of each "<sample_id>:<barcode>" key to look for repeats.
Counter(adata.obs.sample_id_plus_barcode)

Counter({'HCAHeart7664652:AGCCTAACAATCCGAT-1': 1,
         'HCAHeart7664652:CGGGTCATCTAAGCCA-1': 1,
         'HCAHeart7664652:GTCAAGTTCACAAACC-1': 1,
         'HCAHeart7664652:TACAGTGGTGATAAAC-1': 1,
         'HCAHeart7664652:CTAACTTGTATTCGTG-1': 1,
         'HCAHeart7664652:CTAGCCTCAAAGGAAG-1': 1,
         'HCAHeart7664652:CGAATGTCAGCTGGCT-1': 1,
         'HCAHeart7664652:CGTCAGGGTTGTTTGG-1': 1,
         'HCAHeart7664652:GAAACTCGTCCATCCT-1': 1,
         'HCAHeart7664652:GAGCAGATCCGCGGTA-1': 1,
         'HCAHeart7664652:CCTACACCATCTGGTA-1': 1,
         'HCAHeart7664652:GGATTACTCTTCATGT-1': 1,
         'HCAHeart7664652:TTAACTCAGTGACTCT-1': 1,
         'HCAHeart7664652:TTCGGTCTCAGGCCCA-1': 1,
         'HCAHeart7664652:GCTGCAGTCTGCTGTC-1': 1,
         'HCAHeart7664652:GTAACTGGTGCCTGTG-1': 1,
         'HCAHeart7664652:TTAGGACGTCCAACTA-1': 1,
         'HCAHeart7664652:CCTTTCTAGATGCCAG-1': 1,
         'HCAHeart7664652:TCTTCGGAGAACAACT-1': 1,
         'HCAHeart7664652:ATGCGATTCGCTTAGA-1': 1,


In [35]:
# Build a "<donor_id>:<barcode>" key for every observation.
adata.obs["donor_id_plus_barcode"] = (
    adata.obs["donor_id"].astype(str) + ":" + adata.obs["barcode"]
)

In [36]:
# Count occurrences of each "<donor_id>:<barcode>" key to look for repeats.
Counter(adata.obs.donor_id_plus_barcode)

Counter({'D1-Nuclei_3prime-v2:TCACAAGCACACTGCG-1': 2,
         'D1-Nuclei_3prime-v2:CGGGTCACAGCGAACA-1': 2,
         'D1-Nuclei_3prime-v2:CAGCCGAAGTACGTTC-1': 2,
         'D1-Nuclei_3prime-v2:CTCCTAGTCACATACG-1': 2,
         'D1-Nuclei_3prime-v2:TAGAGCTCATTAACCG-1': 2,
         'D1-Nuclei_3prime-v2:CTCACACGTCCGAGTC-1': 2,
         'D1-Nuclei_3prime-v2:CATTCGCTCACCTTAT-1': 2,
         'D1-Nuclei_3prime-v2:GTGCGGTAGATCCCAT-1': 2,
         'D1-Nuclei_3prime-v2:AGCATACGTAAGGGCT-1': 2,
         'D1-Nuclei_3prime-v2:GTCATTTGTAAGTAGT-1': 2,
         'D1-Nuclei_3prime-v2:AGACGTTGTAAGTTCC-1': 2,
         'D1-Nuclei_3prime-v2:CTGCCTAAGGTGTTAA-1': 2,
         'D1-Nuclei_3prime-v2:CATTATCGTCGAGATG-1': 2,
         'D1-Nuclei_3prime-v2:CCCAATCGTAGAGCTG-1': 2,
         'D1-Nuclei_3prime-v2:AAACCTGAGGGATACC-1': 2,
         'D1-Nuclei_3prime-v2:TCAATCTAGCCCGAAA-1': 2,
         'D2-Nuclei_3prime-v2:CTCTGGTCATCACGTA-1': 2,
         'D2-Nuclei_3prime-v2:TTGAACGGTACGCTGC-1': 2,
         'D2-Nuclei_3prime-v

Check whether these cells should be collapsed together (i.e., whether they are just resequencing of the same library). In other words, are they the same nuclei? As a quick examination, we spot-check a few entries that share the same donor_id_plus_barcode and see whether they have the same cell type annotation. It turns out this is NOT the case, so these are likely different nuclei from different runs, rather than resequencing of the same library.

In [47]:
# Show the obs rows that share one repeated "<donor_id>:<barcode>" key.
adata[adata.obs["donor_id_plus_barcode"] == "AH1-Nuclei_Multiome-v1:TAGCCTGAGTTAGTTG-1"].obs

Unnamed: 0_level_0,donor_id,age,sex,cell_type,technology,region,study,cell_or_nuclei,consistent_cell_type,n_genes_by_counts,...,log1p_total_counts_hb,pct_counts_hb,n_genes,doublet_score,predicted_doublet,leiden,sample_id,barcode,donor_id_plus_barcode,sample_id_plus_barcode
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HCAHeartST10773166_HCAHeartST10781063_TAGCCTGAGTTAGTTG-1,AH1-Nuclei_Multiome-v1,47.5,female,Endothelial cell,Multiome-v1,LV,Kanemaru 2023,Nuclei,Endothelial,517,...,0.0,0.0,517,0.012787,False,9,HCAHeartST10773166_HCAHeartST10781063,TAGCCTGAGTTAGTTG-1,AH1-Nuclei_Multiome-v1:TAGCCTGAGTTAGTTG-1,HCAHeartST10773166_HCAHeartST10781063:TAGCCTGA...
HCAHeartST11064575_HCAHeartST11023240_TAGCCTGAGTTAGTTG-1,AH1-Nuclei_Multiome-v1,47.5,female,Mural cell,Multiome-v1,LV,Kanemaru 2023,Nuclei,Pericyte,1168,...,0.0,0.0,1168,0.022912,False,11,HCAHeartST11064575_HCAHeartST11023240,TAGCCTGAGTTAGTTG-1,AH1-Nuclei_Multiome-v1:TAGCCTGAGTTAGTTG-1,HCAHeartST11064575_HCAHeartST11023240:TAGCCTGA...


#### Use sample_id_plus_barcode as the adata.obs_names

In [48]:
# The new obs names must identify every observation unambiguously, so verify that
# explicitly before overwriting the existing names.
assert adata.obs['sample_id_plus_barcode'].is_unique, "sample_id_plus_barcode contains duplicates"
adata.obs_names = adata.obs['sample_id_plus_barcode']

In [49]:
# Save the combined, re-indexed AnnData to disk in h5ad format.
adata.write_h5ad("processed_LV_all.h5ad")