### Make sure all of the metadata for RNA adata is correct

In [1]:
import scanpy as sc 
import pandas as pd
import numpy as np
from collections import Counter

In [2]:
%%time
# Read the AnnData object stored in 05E_all_snRNA_adata.h5ad into memory.
# The recorded run of this cell took several minutes of wall time.
adata = sc.read_h5ad("05E_all_snRNA_adata.h5ad")
# Bare expression on the last line so the notebook shows the AnnData summary.
adata

CPU times: user 24.3 s, sys: 2min 55s, total: 3min 20s
Wall time: 5min 38s


AnnData object with n_obs × n_vars = 2305964 × 16115
    obs: 'age', 'donor_id', 'sex', 'region', 'cell_type', 'disease', 'consistent_cell_type', 'study', 'technology', 'cell_or_nuclei', 'barcode', 'sample_id', 'age_status', 'tech_plus_study', 'disease_binary', 'decade', 'age_group', '_scvi_batch', '_scvi_labels', 'leiden_scVI', 'scvi_cell_type', 'redo_leiden_0.5', 'UMAP1', 'UMAP2', 'v2_scvi_cell_type'
    obsm: 'X_scVI', 'X_umap', '_scvi_extra_categorical_covs'
    layers: 'counts', 'scvi_normalized'

#### Fix the decade column: recompute it as floor(age / 10)

In [3]:
# Recompute the decade of each observation from its age: floor(age / 10).
# The result stays floating point (e.g. 5.0 for ages 50-59).
adata.obs["decade"] = np.floor(adata.obs["age"].div(10))

### Check the metadata columns

In [4]:
# Tally how many observations fall into each decade value.
Counter(adata.obs["decade"])

Counter({5.0: 621503,
         6.0: 563720,
         4.0: 465623,
         1.0: 164857,
         7.0: 137635,
         2.0: 126231,
         3.0: 113807,
         0.0: 92942,
         8.0: 10546,
         9.0: 9100})

In [5]:
# Tally observations per sequencing technology label.
Counter(adata.obs["technology"])

Counter({'3prime-v3': 1417789,
         'Multiome-v1': 510638,
         '5prime-v1': 194772,
         'Dropseq': 110943,
         '3prime-v1': 40889,
         '3prime-v2': 30933})

In [6]:
# Tally observations per disease label.
Counter(adata.obs["disease"])

Counter({'ND': 1212319,
         'DCM': 564000,
         'HCM': 225919,
         'pediatric HF': 92121,
         'ICM': 69611,
         'ARVC': 67834,
         'AMI': 66978,
         'NCCM': 7182})

In [7]:
# Tally observations per value of the binary disease flag.
Counter(adata.obs["disease_binary"])

Counter({'N': 1212319, 'Y': 1093645})

In [8]:
# Tally observations per source study.
Counter(adata.obs["study"])

Counter({'Chaffin 2022': 577886,
         'ENCODE v4 (Snyder)': 484348,
         'Reichart 2022': 444406,
         'Koenig 2022': 194772,
         'Kuppe 2022': 140044,
         'Penn': 110943,
         'Hill 2022': 109479,
         'Simonson 2023': 94803,
         'Litvinukova 2020': 82104,
         'Sim 2021': 40889,
         'Kanemaru 2023': 26290})

In [9]:
# Tally observations per region label.
Counter(adata.obs["region"])

Counter({'LV': 2262239, 'WH': 23325, 'OFT': 12392, 'Atria': 6887, 'LRV': 1121})

In [10]:
# Tally observations per sex label.
Counter(adata.obs["sex"])

Counter({'male': 1430372, 'female': 875592})

In [11]:
# Tally observations per age group label.
Counter(adata.obs["age_group"])

Counter({'middle': 1087126, 'old': 721001, 'young': 426890, 'fetal': 70947})

In [12]:
# Tally observations per combined technology + study label.
Counter(adata.obs["tech_plus_study"])

Counter({'3prime-v3_Chaffin 2022': 577886,
         'Multiome-v1_ENCODE v4 (Snyder)': 484348,
         '3prime-v3_Reichart 2022': 444406,
         '5prime-v1_Koenig 2022': 194772,
         '3prime-v3_Kuppe 2022': 140044,
         'Dropseq_Penn': 110943,
         '3prime-v3_Hill 2022': 109479,
         '3prime-v3_Simonson 2023': 94803,
         '3prime-v3_Litvinukova 2020': 51171,
         '3prime-v1_Sim 2021': 40889,
         '3prime-v2_Litvinukova 2020': 30933,
         'Multiome-v1_Kanemaru 2023': 26290})

#### Assign `v2_scvi_cell_type` as `final_cell_type`

In [13]:
# Expose the v2 scVI annotation under the name `final_cell_type`;
# the original `v2_scvi_cell_type` column is kept as well.
adata.obs["final_cell_type"] = adata.obs["v2_scvi_cell_type"]

In [14]:
# Display the AnnData summary to confirm `final_cell_type` now appears in obs.
adata

AnnData object with n_obs × n_vars = 2305964 × 16115
    obs: 'age', 'donor_id', 'sex', 'region', 'cell_type', 'disease', 'consistent_cell_type', 'study', 'technology', 'cell_or_nuclei', 'barcode', 'sample_id', 'age_status', 'tech_plus_study', 'disease_binary', 'decade', 'age_group', '_scvi_batch', '_scvi_labels', 'leiden_scVI', 'scvi_cell_type', 'redo_leiden_0.5', 'UMAP1', 'UMAP2', 'v2_scvi_cell_type', 'final_cell_type'
    obsm: 'X_scVI', 'X_umap', '_scvi_extra_categorical_covs'
    layers: 'counts', 'scvi_normalized'

### Save the final adata, with and without the `scvi_normalized` layer

In [15]:
%%time
adata.write("07_final_RNA.h5ad")

CPU times: user 47.4 s, sys: 2min 29s, total: 3min 17s
Wall time: 3min 41s


In [16]:
%%time
del adata.layers['scvi_normalized']

CPU times: user 6.8 ms, sys: 2 s, total: 2.01 s
Wall time: 1.98 s


In [17]:
# Persist the object in its current in-memory state to a second .h5ad file.
adata.write_h5ad("07_final_RNA_without_scvi.h5ad")