### Convert the files downloaded from Broad Single Cell Portal to an adata format
#### Filter to just LV cells

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import os
from collections import Counter
import re
import scanpy.external as sce

### Load in the count matrix
This will take a few minutes to load

In [2]:
%%time
# Load the count matrix, need to perform transpose
adata = sc.read_mtx('AllNuclei_snRNA_counts.mtx').T

CPU times: user 8min 22s, sys: 16.1 s, total: 8min 38s
Wall time: 8min 37s


In [4]:
adata.layers["counts"] = adata.X
adata.layers["counts"].sum(axis = 1)

matrix([[24524.],
        [ 8037.],
        [ 5459.],
        ...,
        [ 4586.],
        [ 4651.],
        [ 3079.]], dtype=float32)

In [5]:
# Load the genes/features
genes = pd.read_csv('AllNuclei_snRNA_counts_rownames.txt', header=None, sep='\t')
adata.var_names = genes[0].values

In [6]:
# Load the barcodes/cell names
barcodes = pd.read_csv('AllNuclei_snRNA_counts_colnames.txt', header=None, sep='\t')
adata.obs_names = barcodes[0].values

In [7]:
metadata = pd.read_csv('AllNuclei_snRNA_metadata.csv', skiprows = [1])
metadata.head()
adata.obs = metadata

In [8]:
adata.obs

Unnamed: 0,NAME,orig_ident,nCount_RNA,nFeature_RNA,labID,procedure,age,gender,echoEF,vers10X,...,donor_id,species,species__ontology_label,disease,disease__ontology_label,organ,organ__ontology_label,library_preparation_protocol,sex,library_preparation_protocol__ontology_label
0,P8_1_AAGCCATGTCGGCTAC-1,P8_1,24524,7957,P8,Redo_Sternomoty_Norwood,0y_2m_3d,M,P8_1,v3,...,P8,NCBITaxon_9606,Homo sapiens,MONDO_0005453,congenital heart disease,UBERON_0000948,heart,EFO_0009922,male,10x 3' v3
1,P40_2_GTTGTGAAGCGCCATC-1,P40_2,8037,3749,P40,Heart_Transplant,4y_4m_20d,M,P40_2,v3,...,P40,NCBITaxon_9606,Homo sapiens,MONDO_0005453,congenital heart disease,UBERON_0000948,heart,EFO_0009922,male,10x 3' v3
2,P26_1_AGTGCCGAGATAACGT-1,P26_1,5459,3252,P26,TOF_Repai,0y_3m_3d,M,P26_1,v3,...,P26,NCBITaxon_9606,Homo sapiens,MONDO_0005453,congenital heart disease,UBERON_0000948,heart,EFO_0009922,male,10x 3' v3
3,P26_1_TCGCACTCATATCGGT-1,P26_1,6277,3356,P26,TOF_Repai,0y_3m_3d,M,P26_1,v3,...,P26,NCBITaxon_9606,Homo sapiens,MONDO_0005453,congenital heart disease,UBERON_0000948,heart,EFO_0009922,male,10x 3' v3
4,P8_2_CTAGACATCCCATACC-1,P8_2,21356,7881,P8,Redo_Sternomoty_Norwood,0y_2m_3d,M,P8_2,v3,...,P8,NCBITaxon_9606,Homo sapiens,MONDO_0005453,congenital heart disease,UBERON_0000948,heart,EFO_0009922,male,10x 3' v3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157288,RV_198_2_ATTCTTGCATCTCCCA-1,RV_198_2,5004,2177,13_198,,11y,M,RV_198_2,v3,...,13_198_RV,NCBITaxon_9606,Homo sapiens,MONDO_0005453,congenital heart disease,UBERON_0000948,heart,EFO_0009922,male,10x 3' v3
157289,RV_198_2_CAAGACTGTAATACCC-1,RV_198_2,4733,2048,13_198,,11y,M,RV_198_2,v3,...,13_198_RV,NCBITaxon_9606,Homo sapiens,MONDO_0005453,congenital heart disease,UBERON_0000948,heart,EFO_0009922,male,10x 3' v3
157290,RV_198_2_AGTTCGAAGTGCGTCC-1,RV_198_2,4586,1844,13_198,,11y,M,RV_198_2,v3,...,13_198_RV,NCBITaxon_9606,Homo sapiens,MONDO_0005453,congenital heart disease,UBERON_0000948,heart,EFO_0009922,male,10x 3' v3
157291,RV_198_1_GTCGTAACACAAGCCC-1,RV_198_1,4651,1999,13_198,,11y,M,RV_198_1,v3,...,13_198_RV,NCBITaxon_9606,Homo sapiens,MONDO_0005453,congenital heart disease,UBERON_0000948,heart,EFO_0009922,male,10x 3' v3


In [10]:
adata.obs_names = adata.obs['NAME']
adata.obs = adata.obs.drop(columns = "NAME")

In [19]:
# all of the obs_names are unique
Counter(adata.obs_names).most_common()[0]

('P8_1_AAGCCATGTCGGCTAC-1', 1)

#### Remove the RV samples

In [24]:
sum(adata.obs_names.str.startswith("RV_"))

23733

In [27]:
adata = adata[~adata.obs_names.str.startswith("RV_"), :]

In [28]:
adata.write("01_Hill_LV_adata.h5ad")