#### Make the metadata columns consistent between the finalized RNA and ATAC objects. Wherever `06D_filtered_peak_matrix.h5ad` (the ATAC object) is inconsistent with the finalized RNA object (`07_final_RNA_without_scvi.h5ad`), we will change the ATAC object to match the RNA object.

In [1]:
import scanpy as sc 
import snapatac2 as snap
import pandas as pd
import numpy as np
from collections import Counter

In [2]:
%%time
# Load the filtered ATAC peak matrix; its obs metadata is what this notebook edits.
ATAC_adata = sc.read_h5ad("06D_filtered_peak_matrix.h5ad")
ATAC_adata

CPU times: user 3.85 s, sys: 22.9 s, total: 26.8 s
Wall time: 26.7 s


AnnData object with n_obs × n_vars = 690044 × 654221
    obs: 'ATAC_barcode', 'sample_id', 'leiden', 'donor_id', 'study', 'age_status', 'age', 'sex', 'region', 'disease_binary', 'technology', 'fragment_file', 'full_path', 'file', 'nfrag', 'tsse', 'cell_type', 'tech_plus_study'
    var: 'count', 'selected'
    uns: 'age_status_colors', 'cell_type_colors', 'leiden', 'leiden_colors', 'neighbors', 'spectral_eigenvalue', 'study_colors'
    obsm: 'X_spectral', 'X_spectral_harmony', 'X_umap'
    obsp: 'connectivities', 'distances'

In [3]:
%%time
# Load the finalized RNA object; it is only read here, as the reference for metadata naming.
RNA_adata = sc.read_h5ad("../../RNA/aggregated_analysis/07_final_RNA_without_scvi.h5ad")
RNA_adata

CPU times: user 14.2 s, sys: 1min 41s, total: 1min 55s
Wall time: 2min 24s


AnnData object with n_obs × n_vars = 2305964 × 16115
    obs: 'age', 'donor_id', 'sex', 'region', 'cell_type', 'disease', 'consistent_cell_type', 'study', 'technology', 'cell_or_nuclei', 'barcode', 'sample_id', 'age_status', 'tech_plus_study', 'disease_binary', 'decade', 'age_group', '_scvi_batch', '_scvi_labels', 'leiden_scVI', 'scvi_cell_type', 'redo_leiden_0.5', 'UMAP1', 'UMAP2', 'v2_scvi_cell_type', 'final_cell_type'
    obsm: 'X_scVI', 'X_umap', '_scvi_extra_categorical_covs'
    layers: 'counts'

### Check the columns

In [4]:
# Snapshot the obs column names of both objects as plain Python lists.
ATAC_metadata_columns = ATAC_adata.obs.columns.tolist()
RNA_metadata_columns = RNA_adata.obs.columns.tolist()

In [5]:
# Columns present only in the ATAC object.
set(ATAC_metadata_columns).difference(RNA_metadata_columns)

{'ATAC_barcode',
 'file',
 'fragment_file',
 'full_path',
 'leiden',
 'nfrag',
 'tsse'}

In [6]:
# Columns present only in the RNA object.
set(RNA_metadata_columns).difference(ATAC_metadata_columns)

{'UMAP1',
 'UMAP2',
 '_scvi_batch',
 '_scvi_labels',
 'age_group',
 'barcode',
 'cell_or_nuclei',
 'consistent_cell_type',
 'decade',
 'disease',
 'final_cell_type',
 'leiden_scVI',
 'redo_leiden_0.5',
 'scvi_cell_type',
 'v2_scvi_cell_type'}

In [7]:
# Columns present in both objects.
set(RNA_metadata_columns).intersection(ATAC_metadata_columns)

{'age',
 'age_status',
 'cell_type',
 'disease_binary',
 'donor_id',
 'region',
 'sample_id',
 'sex',
 'study',
 'tech_plus_study',
 'technology'}

#### First, we will add any metadata missing in the ATAC_adata that is present in the RNA adata and vice versa:

- Missing in the RNA adata: none, since the columns in the ATAC only are specific to that assay
- Missing in the ATAC adata: `cell_or_nuclei` (all nuclei), `age_group`, `decade`, `disease` (the only disease in the ATAC data is AMI), and `final_cell_type`

#### Then we will make sure that the naming scheme is consistent for the ATAC and RNA for the shared columns

In [8]:
def get_age_group(age):
    '''
    Categorize a donor age into one of three discrete age groups.

    Parameters
    ----------
    age : int or float
        Donor age, compared against the cut-offs 40 and 60
        (presumably in years -- confirm against the metadata source).

    Returns
    -------
    str or float
        "young" for age < 40, "middle" for 40 <= age < 60, "old" for age >= 60.
        A missing age (None / NaN) returns np.nan instead of raising.
    '''
    # A missing age fails every comparison below, which previously left the result
    # unassigned and raised UnboundLocalError; return NaN explicitly instead.
    if pd.isna(age):
        return np.nan
    if age < 40:
        return "young"
    if age < 60:
        return "middle"
    return "old"

In [9]:
%%time
# Add the age_group and decade columns that exist in the RNA object.
ATAC_adata.obs['age_group'] = ATAC_adata.obs['age'].apply(get_age_group)

# Fetal donors form their own age group. Compare case-insensitively so this cell still
# labels them when it is re-run after age_status has been lowercased further down.
is_fetal = ATAC_adata.obs['age_status'].str.lower() == "fetal"
ATAC_adata.obs.loc[is_fetal, 'age_group'] = "fetal"

# Decade: age divided by 10, rounded down.
ATAC_adata.obs['decade'] = np.floor(ATAC_adata.obs['age'] / 10)

CPU times: user 160 ms, sys: 5.19 ms, total: 166 ms
Wall time: 164 ms


In [10]:
# The ATAC obs has no final_cell_type column, so copy its cell_type labels under the
# column name used by the RNA object.
ATAC_adata.obs['final_cell_type'] = ATAC_adata.obs['cell_type']

In [11]:
# Label every ATAC observation as "Nuclei".
ATAC_adata.obs['cell_or_nuclei'] = "Nuclei"

In [12]:
# Disease labels used in the RNA object, with the number of observations for each.
Counter(RNA_adata.obs.disease)

Counter({'ND': 1212319,
         'DCM': 564000,
         'HCM': 225919,
         'pediatric HF': 92121,
         'ICM': 69611,
         'ARVC': 67834,
         'AMI': 66978,
         'NCCM': 7182})

In [13]:
# The ATAC object only carries a binary disease flag; count its values.
Counter(ATAC_adata.obs.disease_binary)

Counter({'N': 662539, 'Y': 27505})

In [14]:
# Derive the disease label from the binary flag: "Y" becomes "AMI", everything else "ND".
has_disease = ATAC_adata.obs['disease_binary'] == "Y"
ATAC_adata.obs['disease'] = np.where(has_disease, "AMI", "ND")

Now check the ATAC and RNA shared columns

In [15]:
# Refresh both column lists now that new columns were added to the ATAC obs,
# then show which RNA columns the ATAC object still lacks.
ATAC_metadata_columns = ATAC_adata.obs.columns.tolist()
RNA_metadata_columns = RNA_adata.obs.columns.tolist()

set(RNA_metadata_columns).difference(ATAC_metadata_columns)

{'UMAP1',
 'UMAP2',
 '_scvi_batch',
 '_scvi_labels',
 'barcode',
 'consistent_cell_type',
 'leiden_scVI',
 'redo_leiden_0.5',
 'scvi_cell_type',
 'v2_scvi_cell_type'}

In [16]:
# Columns present in both objects. Sorted, because set iteration order for strings is not
# stable between kernel sessions and would otherwise reorder the report below.
shared_metadata_columns = sorted(set(RNA_metadata_columns) & set(ATAC_metadata_columns))

In [17]:
# Report, for every shared metadata column, which values occur only in the ATAC object,
# only in the RNA object, or in both. Columns in skipped_columns are left out of the report.
skipped_columns = {"barcode", "cell_type", "sample_id", "donor_id"}
for metadata_col in shared_metadata_columns:
    if metadata_col in skipped_columns:
        continue

    # Build each value set once per column; the originals were rebuilt for every comparison.
    atac_values = set(ATAC_adata.obs[metadata_col].unique())
    rna_values = set(RNA_adata.obs[metadata_col].unique())

    print(metadata_col)

    print("only in ATAC:")
    print(atac_values - rna_values)

    print("only in RNA:")
    print(rna_values - atac_values)

    print("shared:")
    print(rna_values & atac_values)

    print("\n")

tech_plus_study
only in ATAC:
{'nan_Ameen 2022', 'nan_Kuppe 2022', 'nan_ENCODE v4 (Snyder)', 'nan_Kanemaru 2023', '10X_ATAC_Penn'}
only in RNA:
{'Multiome-v1_Kanemaru 2023', '3prime-v2_Litvinukova 2020', '3prime-v3_Simonson 2023', '3prime-v3_Kuppe 2022', '3prime-v3_Reichart 2022', 'Multiome-v1_ENCODE v4 (Snyder)', '3prime-v3_Hill 2022', '3prime-v3_Chaffin 2022', '3prime-v1_Sim 2021', '5prime-v1_Koenig 2022', '3prime-v3_Litvinukova 2020', 'Dropseq_Penn'}
shared:
set()


disease
only in ATAC:
set()
only in RNA:
{'NCCM', 'DCM', 'ICM', 'HCM', 'ARVC', 'pediatric HF'}
shared:
{'ND', 'AMI'}


final_cell_type
only in ATAC:
set()
only in RNA:
{'Endocardial', 'LEC'}
shared:
{'Pericyte', 'Fibroblast', 'Mast', 'Neuronal', 'Cardiomyocyte', 'vSMC', 'Epicardial', 'Endothelial', 'Adipocyte', 'Myeloid', 'Lymphoid'}


study
only in ATAC:
{'Ameen 2022'}
only in RNA:
{'Sim 2021', 'Simonson 2023', 'Koenig 2022', 'Reichart 2022', 'Chaffin 2022', 'Hill 2022', 'Litvinukova 2020'}
shared:
{'Penn', 'Kanemaru 20

### Make the metadata consistent

- age_status: make Postnatal and Fetal in ATAC lowercase
- technology, tech_plus_study: investigate why some of them are nan and fix this

In [18]:
%%time

# Lowercase every age_status label in the ATAC obs.
# Note: this overwrites the column in place, so earlier cells that compare against the
# capitalised labels see different values if they are re-run after this one.
ATAC_adata.obs['age_status'] = ATAC_adata.obs['age_status'].str.lower()

CPU times: user 12 ms, sys: 60 μs, total: 12.1 ms
Wall time: 10.8 ms


In [19]:
# Number of ATAC observations per study.
Counter(ATAC_adata.obs.study)

Counter({'ENCODE v4 (Snyder)': 494790,
         'Penn': 117876,
         'Kuppe 2022': 34319,
         'Kanemaru 2023': 28952,
         'Ameen 2022': 14107})

In [20]:
# Number of ATAC observations per technology label, including missing values.
Counter(ATAC_adata.obs.technology)

Counter({nan: 572168, '10X_ATAC': 117876})

In [21]:
# Reassign the technology from the study, since the existing column is NaN for every
# study except Penn.
study_to_technology = {
    "ENCODE v4 (Snyder)": "Multiome-v1",
    "Penn": "10X_ATAC",
    "Kuppe 2022": "10X_ATAC",
    "Kanemaru 2023": "Multiome-v1",
    "Ameen 2022": "10X_ATAC",
}
technology = ATAC_adata.obs['study'].map(study_to_technology)

# Series.map yields NaN for any study missing from the dictionary; fail loudly instead of
# silently reintroducing the missing technologies this cell is meant to remove.
unmapped_studies = set(ATAC_adata.obs.loc[technology.isna(), 'study'])
if unmapped_studies:
    raise ValueError(f"No technology assigned for studies: {unmapped_studies}")

ATAC_adata.obs['technology'] = technology

In [22]:
# Recount the technology labels after the reassignment.
Counter(ATAC_adata.obs.technology)

Counter({'Multiome-v1': 523742, '10X_ATAC': 166302})

In [23]:
# Rebuild the combined label as "<technology>_<study>" from the string form of both columns.
technology_labels = ATAC_adata.obs['technology'].astype(str)
study_labels = ATAC_adata.obs['study'].astype(str)
ATAC_adata.obs['tech_plus_study'] = technology_labels + "_" + study_labels

### Recheck the consistency of the metadata

In [24]:
# Columns present in both objects. Sorted, because set iteration order for strings is not
# stable between kernel sessions and would otherwise reorder the report below.
shared_metadata_columns = sorted(set(RNA_metadata_columns) & set(ATAC_metadata_columns))

In [25]:
# Repeat the per-column comparison after the fixes: report which values occur only in the
# ATAC object, only in the RNA object, or in both. Columns in skipped_columns are left out.
skipped_columns = {"barcode", "cell_type", "sample_id", "donor_id"}
for metadata_col in shared_metadata_columns:
    if metadata_col in skipped_columns:
        continue

    # Build each value set once per column; the originals were rebuilt for every comparison.
    atac_values = set(ATAC_adata.obs[metadata_col].unique())
    rna_values = set(RNA_adata.obs[metadata_col].unique())

    print(metadata_col)

    print("only in ATAC:")
    print(atac_values - rna_values)

    print("only in RNA:")
    print(rna_values - atac_values)

    print("shared:")
    print(rna_values & atac_values)

    print("\n")

tech_plus_study
only in ATAC:
{'10X_ATAC_Ameen 2022', '10X_ATAC_Penn', '10X_ATAC_Kuppe 2022'}
only in RNA:
{'3prime-v2_Litvinukova 2020', '3prime-v3_Simonson 2023', '3prime-v3_Kuppe 2022', '3prime-v3_Reichart 2022', '3prime-v3_Hill 2022', '3prime-v3_Chaffin 2022', '3prime-v1_Sim 2021', '5prime-v1_Koenig 2022', '3prime-v3_Litvinukova 2020', 'Dropseq_Penn'}
shared:
{'Multiome-v1_Kanemaru 2023', 'Multiome-v1_ENCODE v4 (Snyder)'}


disease
only in ATAC:
set()
only in RNA:
{'NCCM', 'DCM', 'ICM', 'HCM', 'ARVC', 'pediatric HF'}
shared:
{'ND', 'AMI'}


final_cell_type
only in ATAC:
set()
only in RNA:
{'Endocardial', 'LEC'}
shared:
{'Pericyte', 'Fibroblast', 'Mast', 'Neuronal', 'Cardiomyocyte', 'vSMC', 'Epicardial', 'Endothelial', 'Adipocyte', 'Myeloid', 'Lymphoid'}


study
only in ATAC:
{'Ameen 2022'}
only in RNA:
{'Sim 2021', 'Simonson 2023', 'Koenig 2022', 'Reichart 2022', 'Chaffin 2022', 'Hill 2022', 'Litvinukova 2020'}
shared:
{'Penn', 'Kanemaru 2023', 'ENCODE v4 (Snyder)', 'Kuppe 2022'}



### Now that the metadata is consistent with the RNA adata, save this final ATAC adata

In [26]:
# Display the object to confirm the obs columns it now carries before saving.
ATAC_adata

AnnData object with n_obs × n_vars = 690044 × 654221
    obs: 'ATAC_barcode', 'sample_id', 'leiden', 'donor_id', 'study', 'age_status', 'age', 'sex', 'region', 'disease_binary', 'technology', 'fragment_file', 'full_path', 'file', 'nfrag', 'tsse', 'cell_type', 'tech_plus_study', 'age_group', 'decade', 'final_cell_type', 'cell_or_nuclei', 'disease'
    var: 'count', 'selected'
    uns: 'age_status_colors', 'cell_type_colors', 'leiden', 'leiden_colors', 'neighbors', 'spectral_eigenvalue', 'study_colors'
    obsm: 'X_spectral', 'X_spectral_harmony', 'X_umap'
    obsp: 'connectivities', 'distances'

In [None]:
%%time
# Save the ATAC object with its updated metadata.
final_atac_path = "07_final_ATAC.h5ad"
ATAC_adata.write_h5ad(final_atac_path)

... storing 'age_status' as categorical
... storing 'technology' as categorical
... storing 'tech_plus_study' as categorical
... storing 'age_group' as categorical
... storing 'cell_or_nuclei' as categorical
... storing 'disease' as categorical
