In [4]:
"""

Pack the scRNA-seq data using scanpy, prep for scran normalisation

"""

import logging, matplotlib, os, sys
import scanpy as sc
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
from anndata import AnnData
from matplotlib import rcParams
from matplotlib import colors
import seaborn as sb
from rpy2.robjects.packages import importr
# --- Global logging / plotting configuration --------------------------------
sc.settings.verbosity = 3      # most verbose of scanpy's usual levels (errors, warnings, info, hints)
sc.settings.autoshow = False   # do not pop figures up automatically

# Apply scanpy's figure defaults FIRST: set_figure_params() re-initialises a
# number of matplotlib rcParams (figure.figsize and font.size among them), so
# any custom value assigned before this call is silently overwritten.
sc.set_figure_params(dpi=200, dpi_save=200)

# Notebook-specific overrides, applied after scanpy so that they actually stick.
plt.rcParams['figure.figsize'] = (8, 8)
matplotlib.rcParams['pdf.fonttype'] = 42   # embed TrueType fonts so PDF text stays editable
matplotlib.rcParams['font.size'] = 10

import sc_utils

In [6]:
# One row per input matrix: (gzipped CSV path, replicate label, sample type).
sample_sheet = (
    ("wagner2020/ss.C-sec-1.csv.gz", "C-sec#1", 'C-sec'),
    ("wagner2020/ss.C-sec-2.csv.gz", "C-sec#2", 'C-sec'),
    ("wagner2020/ss.GRP-1.csv.gz", "GRP#1", 'GRP'),
    ("wagner2020/ss.GRP-2.csv.gz", "GRP#2", 'GRP'),
)

# Load every matrix through sc_utils.sparsify, in sheet order, attaching the
# replicate / type labels via obs_add.
samples = [
    sc_utils.sparsify(csv_path, obs_add={'replicate': replicate, 'type': sample_type}, csv=True)
    for csv_path, replicate, sample_type in sample_sheet
]
print('Loaded Samples...')

Started wagner2020/ss.C-sec-1.csv.gz
Sparsifying
Loaded
Done
Started wagner2020/ss.C-sec-2.csv.gz
Sparsifying
Loaded
Done
Started wagner2020/ss.GRP-1.csv.gz
Sparsifying
Loaded
Done
Started wagner2020/ss.GRP-2.csv.gz
Sparsifying
Loaded
Done
Loaded Samples...


In [7]:
# Quick per-sample pre-filtering. Keep these thresholds permissive: overly
# strict cut-offs can distort downstream analysis, while mild ones still remove
# trivially uninteresting cells.

def prefilter_cells(samples, min_genes=1000, max_counts=200000, min_counts=5000):
    """Filter low-quality cells out of every sample, in place.

    Each threshold is applied to all samples before moving on to the next
    threshold (min_genes, then max_counts, then min_counts).

    Parameters
    ----------
    samples : iterable of AnnData
        Objects accepted by ``sc.pp.filter_cells``; each one is modified in place.
    min_genes : int
        Minimum number of expressed genes a cell must have to be kept.
    max_counts : int
        Maximum total counts a cell may have to be kept.
    min_counts : int
        Minimum total counts a cell must have to be kept.

    Returns
    -------
    None
    """
    # sc.pp.filter_cells accepts only one threshold per call, hence the outer loop.
    thresholds = (
        {'min_genes': min_genes},
        {'max_counts': max_counts},
        {'min_counts': min_counts},
    )
    for threshold in thresholds:
        for sam in samples:
            sc.pp.filter_cells(sam, **threshold)


prefilter_cells(samples)

# Do not filter genes here: concatenate joins on the intersection of genes by
# default, so a gene that fails the filter in a single sample would be dropped
# from all other samples as well.

filtered out 58 cells that have less than 1000 genes expressed
filtered out 76 cells that have less than 1000 genes expressed
filtered out 2101 cells that have less than 1000 genes expressed
filtered out 2145 cells that have less than 1000 genes expressed
filtered out 4838 cells that have less than 5000 counts
filtered out 4921 cells that have less than 5000 counts
filtered out 2399 cells that have less than 5000 counts
filtered out 2386 cells that have less than 5000 counts


[None, None, None, None]

In [8]:
# Merge all samples into a single AnnData object, using the first sample as
# the base and appending the remaining ones.
print('Concatenating')
adata = samples[0].concatenate(*samples[1:])

Concatenating


In [9]:
# Drop the list of per-sample objects; only the concatenated `adata` is used from here on.
del samples

In [10]:
# Store the expression matrix as 32-bit floats.
adata.X = adata.X.astype(np.float32)

In [11]:
# Show a summary of the concatenated object (dimensions and annotation columns).
print(adata)

AnnData object with n_obs × n_vars = 21076 × 58389
    obs: 'replicate', 'type', 'n_genes', 'n_counts', 'batch'


In [12]:
# Report the final matrix dimensions: observations are cells, variables are genes.
n_cells, n_genes = adata.shape
print('Total number of cells: {:d}'.format(n_cells))
print('Total number of genes: {:d}'.format(n_genes))

Total number of cells: 21076
Total number of genes: 58389


In [13]:
# Save the concatenated, pre-filtered object to disk in the working directory.
adata.write('./raw_data.h5ad')

... storing 'replicate' as categorical
... storing 'type' as categorical


In [14]:
# Write every gene (variable) name to a text file, one name per line.
# The context manager guarantees the file is flushed and closed even if a
# write fails part-way through.
with open('gene_names.all.tsv', 'w') as oh:
    for g in adata.var_names:
        oh.write('%s\n' % g)