In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import os
from collections import Counter
import re
import scanpy.external as sce

Load the combined scRNA-seq + snRNA-seq Koenig AnnData object

In [2]:
# Read the combined single-cell + single-nucleus object from disk.
adata = sc.read_h5ad(filename="Koenig_all_sc_snRNA.h5ad")

In [3]:
# Per-cell totals of X; whole-number sums are what we expect if X still holds raw counts.
adata.X.sum(axis=1)

matrix([[1245],
        [1464],
        [1668],
        ...,
        [5454],
        [5947],
        [8425]])

In [4]:
# store the raw counts
# Copy the matrix so the "counts" layer does not share memory with adata.X;
# otherwise any later in-place change to X would also alter the stored counts.
adata.layers["counts"] = adata.X.copy()

In [5]:
# Peek at the first rows of the per-cell annotations.
adata.obs.head(n=5)

Unnamed: 0,Sex,Names,orig.ident,condition,tech
H_ZC-11-292_TAAGTGCAGCAGGTCA,Male,Endocardium,H_ZC-11-292,Donor,SN
H_ZC-11-292_ACAGCCGGTCATACTG,Male,Endocardium,H_ZC-11-292,Donor,SN
H_ZC-11-292_AACTCCCTCTTTAGTC,Male,Endocardium,H_ZC-11-292,Donor,SN
H_ZC-11-292_CTGATCCGTTATTCTC,Male,Endocardium,H_ZC-11-292,Donor,SN
H_ZC-11-292_GCACTCTTCTCGTATT,Male,Endocardium,H_ZC-11-292,Donor,SN


Add age and disease information from the donor metadata file

In [6]:
# Read the headerless, whitespace-separated donor table (one row per donor).
# `sep=r"\s+"` replaces `delim_whitespace=True`, which is deprecated since pandas 2.2.
donor_information_df = pd.read_csv("00_donor_metadata.txt", sep=r"\s+", header=None)
# The file carries no header row, so name the four columns explicitly.
donor_information_df.columns = ["donor_id", "age", "sex", "disease"]
donor_information_df.head()

Unnamed: 0,donor_id,age,sex,disease
0,H_ZC-11-292,68.0,male,Healthy
1,H_ZC-LVAD,58.0,male,DCM
2,TWCM-10-5,53.0,female,DCM
3,TWCM-10-68,62.0,female,Healthy
4,TWCM-11-3,,female,DCM


In [7]:
# add the age information by merging
# Move the cell barcodes out of the index into a column so they survive the merge
# (a column-on-column merge discards the left index). This assumes the obs index
# is unnamed, so that the new column is called 'index'.
adata.obs = adata.obs.reset_index()
# Inner join: cells whose orig.ident has no matching donor_id are dropped.
# validate="many_to_one" raises if a donor appears more than once in the metadata,
# which would otherwise silently duplicate cells (and their barcodes).
merged_df = adata.obs.merge(
    donor_information_df,
    how="inner",
    left_on="orig.ident",
    right_on="donor_id",
    validate="many_to_one",
)
# Re-key the merged table by cell barcode (the 'index' column is kept for now).
merged_df.index = merged_df['index']

In [8]:
# Put the cell barcodes back as the obs index; set_index removes the
# 'index' column in the same step.
adata.obs = adata.obs.set_index("index")

In [9]:
# Keep only the cells present in the merged table (in its row order) ...
adata = adata[merged_df.index, :].copy()
# ... and attach the merged annotations, without the helper 'index' column.
adata.obs = merged_df.drop(columns="index")

In [10]:
# Build the harmonised per-cell metadata table in one chain:
#   1. keep only the relevant columns,
#   2. rename them for consistency with later steps,
#   3. lowercase `sex`, recode `cell_or_nuclei`, and add the constant study-level columns.
# Overwritten columns keep their position; new ones are appended in the order given.
metadata = (
    adata.obs[['sex', 'Names', 'donor_id', 'age', 'disease', 'tech']]
    .rename(columns={'Names': 'cell_type', 'tech': 'cell_or_nuclei'})
    .assign(
        study='Koenig 2022',
        sex=lambda df: df['sex'].str.lower(),
        technology='5prime-v1',
        region='LV',
        # SC -> Cell, SN -> Nuclei
        cell_or_nuclei=lambda df: df['cell_or_nuclei'].map({'SC': 'Cell', 'SN': 'Nuclei'}),
    )
)

# Replace the per-cell annotations with the cleaned table and preview it.
adata.obs = metadata
adata.obs.head()

Unnamed: 0_level_0,sex,cell_type,donor_id,age,disease,cell_or_nuclei,study,technology,region
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
H_ZC-11-292_TAAGTGCAGCAGGTCA,male,Endocardium,H_ZC-11-292,68.0,Healthy,Nuclei,Koenig 2022,5prime-v1,LV
H_ZC-11-292_ACAGCCGGTCATACTG,male,Endocardium,H_ZC-11-292,68.0,Healthy,Nuclei,Koenig 2022,5prime-v1,LV
H_ZC-11-292_AACTCCCTCTTTAGTC,male,Endocardium,H_ZC-11-292,68.0,Healthy,Nuclei,Koenig 2022,5prime-v1,LV
H_ZC-11-292_CTGATCCGTTATTCTC,male,Endocardium,H_ZC-11-292,68.0,Healthy,Nuclei,Koenig 2022,5prime-v1,LV
H_ZC-11-292_GCACTCTTCTCGTATT,male,Endocardium,H_ZC-11-292,68.0,Healthy,Nuclei,Koenig 2022,5prime-v1,LV


Make cell types consistent

In [11]:
# Distinct cell-type labels as provided in the source object.
set(adata.obs["cell_type"])

{'Adipocytes',
 'B-Cells',
 'Cardiomyocytes',
 'Endocardium',
 'Endothelium',
 'Epicardium',
 'Fibroblasts',
 'Lymphatic',
 'Mast',
 'Myeloid',
 'NK/T-Cells',
 'Neurons',
 'Pericytes',
 'Smooth_Muscle'}

In [12]:
# Lookup from this study's cell-type labels to the shared naming scheme.
# B-Cells and NK/T-Cells are both collapsed into "Lymphoid".
cell_type_lookup = {
    "Adipocytes": "Adipocyte",
    "B-Cells": "Lymphoid",
    "Cardiomyocytes": "Cardiomyocyte",
    "Endocardium": "Endocardial",
    "Endothelium": "Endothelial",
    "Epicardium": "Epicardial",
    "Fibroblasts": "Fibroblast",
    "Lymphatic": "LEC",
    "Mast": "Mast",
    "Myeloid": "Myeloid",
    "NK/T-Cells": "Lymphoid",
    "Neurons": "Neuronal",
    "Pericytes": "Pericyte",
    "Smooth_Muscle": "vSMC",
}
adata.obs["consistent_cell_type"] = adata.obs["cell_type"].map(cell_type_lookup)

In [13]:
# Distinct labels after harmonisation.
set(adata.obs["consistent_cell_type"])

{'Adipocyte',
 'Cardiomyocyte',
 'Endocardial',
 'Endothelial',
 'Epicardial',
 'Fibroblast',
 'LEC',
 'Lymphoid',
 'Mast',
 'Myeloid',
 'Neuronal',
 'Pericyte',
 'vSMC'}

In [14]:
# Save the annotated object to disk as .h5ad.
adata.write_h5ad("03_processed_all_Koenig.h5ad")