In [1]:
import numpy as np
import pandas as pd
import anndata as ad
import scanpy as sc
import scipy.io
from scipy.sparse import csr_matrix
print(ad.__version__)

0.10.9


In [2]:
counts = scipy.io.mmread('data/primary_sparse.mtx').tocsr().transpose()

In [3]:
# Load barcodes and genes
raw_barcodes = pd.read_csv('data/primary_colnames_raw.txt', header=None).squeeze().tolist()
raw_genes = pd.read_csv('data/primary_features_raw.txt', header=None).squeeze().tolist()
processed_genes = pd.read_csv('data/primary_features_processed.txt', header=None).squeeze().tolist()

print(f"raw cell number {len(raw_barcodes)}, raw gene number {len(raw_genes)}, the dim of the raw matrix{counts.shape}, the processed gene number {len(processed_genes)}")

raw cell number 156572, raw gene number 25147, the dim of the raw matrix(156572, 25147), the processed gene number 25129


In [4]:
adata = sc.AnnData(X=counts)
adata.var_names = raw_genes
adata.obs_names = raw_barcodes

In [5]:
adata.var_names.intersection(processed_genes)
# this contains the gene only found in the processed data
adata_corrected = adata[:, processed_genes].copy()
len(adata_corrected.var)

25129

In [6]:
metadata = pd.read_table('data/scp_primary_metadata.txt', index_col=0)
# Remove the first row
metadata = metadata.iloc[1:]

cluster_data = pd.read_table('data/primary_clusterdata.txt', index_col=0)
# Remove the first row
cluster_data = cluster_data.iloc[1:]

combined_metadata = pd.merge(cluster_data, metadata, left_index=True, right_index=True)
adata.obs = combined_metadata

adata.obs = adata.obs.astype('category')

columns_to_convert = ['X', 'Y', 'number_of_reads', 'number_of_features', 'Cell.Type']
for column in columns_to_convert:
    adata.obs[column] = pd.to_numeric(adata.obs[column])


  metadata = pd.read_table('data/scp_primary_metadata.txt', index_col=0)
  cluster_data = pd.read_table('data/primary_clusterdata.txt', index_col=0)


In [7]:
adata.obs

Unnamed: 0_level_0,Tissue.Region,Cluster.Label,Time.Point,X,Y,Cell.Type,biosample_id,donor_id,species,species__ontology_label,...,organ,organ__ontology_label,organ_custom,library_preparation_protocol,library_preparation_protocol__ontology_label,cell_type,cell_type__ontology_label,cell_type_custom,number_of_reads,number_of_features
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
D14_OE.AAACCCAGTATTCCTT,OM,MT-Hi B,14 dpi,-11.398092,-1.642075,5,D14_OM,D14_Group,NCBITaxon_10090,Mus musculus,...,UBERON_0001826,nasal cavity mucosa,OM,EFO_0009922,10x 3' v3,CL:0000236,B cell,MT-Hi B,1762,966
D14_OE.AAACGAACAAAGCTCT,OM,Nme1/Nme2 Hi B,14 dpi,-13.586423,-4.545745,5,D14_OM,D14_Group,NCBITaxon_10090,Mus musculus,...,UBERON_0001826,nasal cavity mucosa,OM,EFO_0009922,10x 3' v3,CL:0000236,B cell,Nme1/Nme2 Hi B,2175,969
D14_OE.AAACGAAGTGTTAAAG,OM,Immature B,14 dpi,-11.207908,-1.848815,5,D14_OM,D14_Group,NCBITaxon_10090,Mus musculus,...,UBERON_0001826,nasal cavity mucosa,OM,EFO_0009922,10x 3' v3,CL:0000236,B cell,Immature B,893,573
D14_OE.AAACGCTAGAATACAC,OM,Immature B,14 dpi,-11.215386,-1.674733,5,D14_OM,D14_Group,NCBITaxon_10090,Mus musculus,...,UBERON_0001826,nasal cavity mucosa,OM,EFO_0009922,10x 3' v3,CL:0000236,B cell,Immature B,1459,843
D14_OE.AAACGCTAGCATCCTA,OM,Mature B 2,14 dpi,-12.391555,-4.034981,5,D14_OM,D14_Group,NCBITaxon_10090,Mus musculus,...,UBERON_0001826,nasal cavity mucosa,OM,EFO_0009922,10x 3' v3,CL:0000236,B cell,Mature B 2,1210,590
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Naive_RE.CTCATGCAGGCTTAGG,RM,HSC,Naive,2.256732,-12.009367,10,Naive_RM,Naive_Group,NCBITaxon_10090,Mus musculus,...,UBERON_0001826,nasal cavity mucosa,RM,EFO_0009922,10x 3' v3,CL:0000037,hematopoietic stem cell,HSC,14808,3745
Naive_RE.GTATTGGGTGCCGGTT,RM,HSC,Naive,-5.709469,-3.818193,10,Naive_RM,Naive_Group,NCBITaxon_10090,Mus musculus,...,UBERON_0001826,nasal cavity mucosa,RM,EFO_0009922,10x 3' v3,CL:0000037,hematopoietic stem cell,HSC,11051,2989
Naive_RE.TGGGTTATCGCAATGT,RM,HSC,Naive,2.359610,-11.714253,10,Naive_RM,Naive_Group,NCBITaxon_10090,Mus musculus,...,UBERON_0001826,nasal cavity mucosa,RM,EFO_0009922,10x 3' v3,CL:0000037,hematopoietic stem cell,HSC,8405,2644
Naive_LNG.AGCATCAGTGCCCAGT,LNG,HSC,Naive,-6.198734,-3.709947,10,Naive_LNG,Naive_Group,NCBITaxon_10090,Mus musculus,...,UBERON_0035077,lateral nasal gland,LNG,EFO_0009922,10x 3' v3,CL:0000037,hematopoietic stem cell,HSC,27226,4486


In [17]:
# need to be float for BPCells, otherwise you gonna spend 2hr debugging it
adata_corrected.X = adata_corrected.X.astype(np.float64)

adata_corrected.write('data/flu_raw.h5ad', compression="gzip")


: 

In [16]:
adata_corrected.X.dtype
