### Combine all of the scRNA-seq datasets

In [13]:
import scanpy as sc 
import numpy as np
import pandas as pd
from collections import Counter

In [2]:
# Columns of `.obs` that are retained for every dataset before the datasets
# are concatenated; each input AnnData is subset to exactly these columns.
metadata_to_keep = [
    "age",
    "donor_id",
    "sex",
    "region",
    "cell_type",
    "consistent_cell_type",
    "study",
    "technology",
    "cell_or_nuclei",
]

In [3]:
def confirm_raw_counts(adata):
    """
    Check whether adata.X holds raw counts, i.e. every stored value is an integer.

    Each entry of adata.X is tested individually. Testing only the per-cell
    totals (adata.X.sum(axis=1)) is not sufficient: fractional entries can add
    up to a whole number (0.5 + 0.5 = 1.0), and floating-point rounding of a
    large sum can hide a fractional part.

    Parameters:
    adata (AnnData): The AnnData object containing the data matrix (adata.X).
        adata.X may be a dense array or a scipy sparse matrix.

    Returns:
    bool: True if all values are integers, False otherwise.
    """
    from scipy import sparse  # local import: only this check needs it

    X = adata.X
    if sparse.issparse(X):
        # Implicit zeros are integers, so only the explicitly stored entries
        # have to be inspected.
        values = X.tocsr().data
    else:
        values = np.asarray(X)

    # A value is an integer exactly when rounding leaves it unchanged
    # (NaN never compares equal, so NaN is reported as non-integer).
    return bool(np.all(np.equal(values, np.round(values))))

### Teichmann dataset (Heart Atlas v2)

In [5]:
# Read the Teichman h5ad file from the external datasets directory.
Teichman_adata = sc.read_h5ad(
    "../external_datasets/Heart_Atlas_v2/Teichman_LV_cell.h5ad"
)

# Make the 'counts' layer the main data matrix, then report whether it
# passes the raw-count check.
Teichman_adata.X = Teichman_adata.layers["counts"]
print(f"All raw counts?: {confirm_raw_counts(Teichman_adata)}")

# Restrict the per-cell metadata to the shared set of columns.
Teichman_adata.obs = Teichman_adata.obs.loc[:, metadata_to_keep]

All raw counts?: True


### Koenig dataset

In [7]:
# Read the processed Koenig et al. 2022 h5ad file from the external datasets directory.
Koenig_adata = sc.read_h5ad(
    "../external_datasets/Koenig_et_al_2022/processed_Koenig_cell.h5ad"
)

# Make the 'counts' layer the main data matrix, then report whether it
# passes the raw-count check.
Koenig_adata.X = Koenig_adata.layers["counts"]
print(f"All raw counts?: {confirm_raw_counts(Koenig_adata)}")

# Restrict the per-cell metadata to the shared set of columns.
Koenig_adata.obs = Koenig_adata.obs.loc[:, metadata_to_keep]

All raw counts?: True


### Combine these datasets together

In [8]:
%%time
adata = sc.concat([Teichman_adata, Koenig_adata])
adata

CPU times: user 3.12 s, sys: 1.18 s, total: 4.29 s
Wall time: 4.29 s


AnnData object with n_obs × n_vars = 60819 × 31846
    obs: 'age', 'donor_id', 'sex', 'region', 'cell_type', 'consistent_cell_type', 'study', 'technology', 'cell_or_nuclei'
    obsm: 'X_umap'
    layers: 'counts'

### Filter donors, add metadata, then save this adata (scVI integration is performed in the next script)

In [14]:
# Number of cells contributed by each donor in the combined object.
Counter(adata.obs["donor_id"])

Counter({'D11-Cell_3prime-v3': 11868,
         'HDCM4': 6390,
         'HDCM1': 6376,
         'HDCM3': 6236,
         'HDCM5': 5911,
         'D6-Cell_3prime-v2': 5834,
         'HDCM6': 5788,
         'D6-Cell_3prime-v3': 4406,
         'HDCM7': 4315,
         'D7-Cell_3prime-v2': 1586,
         'D5-Cell_3prime-v2': 1246,
         'D3-Cell_3prime-v2': 858,
         'D4-Cell_3prime-v2': 5})

#### Drop the donors with fewer than 500 cells (donors with 500 or more cells are kept)

In [16]:
%%time
# keep donors with > 500
donor_counts = Counter(adata.obs['donor_id'])
valid_donors = [donor for donor, count in donor_counts.items() if count >= 500]
adata = adata[adata.obs['donor_id'].isin(valid_donors)].copy()

CPU times: user 863 ms, sys: 1.15 s, total: 2.01 s
Wall time: 2.01 s


In [17]:
# Cells per donor after filtering; every remaining donor should have >= 500 cells.
Counter(adata.obs["donor_id"])

Counter({'D11-Cell_3prime-v3': 11868,
         'HDCM4': 6390,
         'HDCM1': 6376,
         'HDCM3': 6236,
         'HDCM5': 5911,
         'D6-Cell_3prime-v2': 5834,
         'HDCM6': 5788,
         'D6-Cell_3prime-v3': 4406,
         'HDCM7': 4315,
         'D7-Cell_3prime-v2': 1586,
         'D5-Cell_3prime-v2': 1246,
         'D3-Cell_3prime-v2': 858})

### Add additional metadata

In [20]:
# Combined label "<technology>_<study>" for every cell.
adata.obs['tech_plus_study'] = (
    adata.obs['technology']
    .astype(str)
    .str.cat(adata.obs['study'].astype(str), sep="_")
)

In [22]:
# Number of cells for each technology/study combination.
Counter(adata.obs.loc[:, 'tech_plus_study'])

Counter({'5prime-v1_Koenig 2022': 35016,
         '3prime-v3_Teichman': 16274,
         '3prime-v2_Teichman': 9524})

In [21]:
%%time
adata.write("01_combined_scRNA.h5ad")

CPU times: user 382 ms, sys: 1.9 s, total: 2.28 s
Wall time: 2.55 s
