In [None]:
import os
import random
# To reproduce the same clusterings as published.
# NOTE(review): this seeds only Python's built-in `random` generator. If the
# downstream scrapi calls draw from a different generator (e.g. numpy.random),
# that generator is not seeded here -- TODO confirm.
random.seed(43)

import scrapi

from scrapi.dataset import Gene_Expression_Dataset as GED
from scrapi.dataset import Data_Mode
from scrapi.dataset import Transformation_Method
from scrapi.dataset import Normalization_Method

from capblood_seq_poc import common as cbs

In [None]:
# Lower and upper estimates of the number of real cells expected per sample
MIN_NUM_CELLS = 1000
MAX_NUM_CELLS = 12000

# Number of noise sources, each with its own transcript count profile.
#   empty droplets only        -> 1
#   empty droplets + debris    -> 2
# For blood we expect debris, empty droplets, and red blood cells -> 3
NUM_SOURCES_OF_NOISE_EXPECTED = 3

# Number of cell types expected at the top level of the hierarchy. For blood,
# we expect at least T Cells, B Cells, and Monocytes.
# NOTE(review): only three types are named here but the value is 4 -- confirm
# which fourth major cell type is intended.
NUM_CELL_TYPES_EXPECTED = 4

# How many clusters to try to separate into. Recommended: at least the number
# of noise sources, plus each major cell type, plus each subtype one level
# below major. Using the values declared above, presumably:
#   noise (3) + cell types (4) + cell subtypes (2 * cell types = 8) = 15
# NOTE(review): this constant is not referenced by the cells visible in this
# section of the notebook -- TODO confirm where it is consumed.
MAX_NUM_CLUSTERS = 15

# Throw away genes that have no value greater than or equal to this
MIN_GENE_COUNT = 3

In [None]:
# Run the full preprocessing pipeline once per sample. Each sample goes
# through: load -> noise-barcode filtering -> low-count gene filtering ->
# per-cell normalization -> log transform -> dimensionality reductions,
# with the workspace saved at three named checkpoints along the way.

# Keyword arguments for the noise-barcode filter, taken from the constants
# configured above. See the Debris Removal notebook for an in-depth example.
noise_filter_kwargs = {
    "min_num_cells": MIN_NUM_CELLS,
    "max_num_cells": MAX_NUM_CELLS,
    "num_sources_noise_expected": NUM_SOURCES_OF_NOISE_EXPECTED,
    "min_num_cell_types_expected": NUM_CELL_TYPES_EXPECTED,
}

# Dimensionality reductions to compute, in the order they are applied,
# paired with the number of output dimensions for each.
reductions = (
    (Transformation_Method.PCA, 30),
    (Transformation_Method.NMF, 30),
    (Transformation_Method.SVD, 30),
    (Transformation_Method.TSNE, 2),
)

for sample in cbs.SAMPLE_NAMES:

    # Load the previously initialized dataset for this sample from data/<sample>
    dataset_folder_path = os.path.join("data", sample)
    dataset = GED(dataset_folder_path)

    # Report the cell count before noise filtering...
    print(dataset.num_cells)

    # ...drop barcodes judged to be noise from their transcript count
    # profile, and report the count again afterwards
    dataset.filter_noise_barcodes(**noise_filter_kwargs)
    print(dataset.num_cells)

    # Checkpoint: keep the unnormalized, debris-filtered data around
    dataset.save("debris_filtered")

    # Drop low-count genes
    dataset.filter_low_transcript_counts(MIN_GENE_COUNT)

    # Normalize each cell's transcript counts by its total transcripts
    dataset.normalize_cells(data_mode=Data_Mode.GENE_PROBABILITIES)

    # Checkpoint: the normalized data
    dataset.save("normalized")

    # Everything below is extra processing for visualization purposes.
    # Log transform (with an offset) first; PCA/t-SNE like this better.
    # NOTE(review): parameters=[5000] is presumably the scale applied before
    # the log(x + 1) -- TODO confirm against scrapi.
    dataset.normalize_genes(
        Normalization_Method.LOG_PLUS_1,
        use_normalized=True,
        parameters=[5000],
    )

    # Project the data through each dimensionality reduction technique
    for reduction_method, reduction_dims in reductions:
        dataset.transform(
            reduction_method,
            num_dimensions=reduction_dims,
            use_normalized=True,
        )

    # Checkpoint: the workspace for use in scrap-viz
    dataset.save("visualization")