In [None]:
import os

import numpy
import pandas
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import capblood_seq
from capblood_seq import config

In [None]:
# Fetch the dataset produced by the "visualization" pipeline. If it isn't
# available locally yet it is downloaded first, then it is loaded into memory.
dataset = capblood_seq.load_dataset(
    pipeline_name="visualization"
)

In [None]:
# The combined tSNE should only contain cells that carry exactly one subject
# label and exactly one cell type label: drop the unlabeled cells first, then
# the cells that are multi-labeled within each of those two label groups.
dataset.filter_unlabeled_cells()
for label_group in (config.SUBJECT_IDS, config.CELL_TYPES):
    dataset.filter_multi_labeled_cells(label_group)

In [None]:
# For the combined tSNE, gather every cell across all samples and record its
# originating sample, time of day, subject, gender and cell type label(s) so
# they can be used for plotting.

# Every label a cell may carry, in reporting order: each cell type immediately
# followed by its "<subtype> <type>" labels, when it has subtypes.
cell_type_labels = []
for cell_type in config.CELL_TYPES:
    cell_type_labels.append(cell_type)
    if cell_type in config.CELL_SUBTYPES:
        for cell_subtype in config.CELL_SUBTYPES[cell_type]:
            cell_type_labels.append("%s %s" % (cell_subtype, cell_type))

# One metadata tuple per cell, in the same order as the rows of the matrix
cell_data = []

# Per-(sample, subject) expression blocks, concatenated once after the loop
# instead of re-copying the growing matrix on every iteration. The list is
# seeded with an empty (0 x genes) block so the result keeps the same shape
# and dtype promotion as before, even when no cells are found at all.
transcript_count_blocks = [numpy.zeros((0, dataset.get_num_genes()))]

for sample in config.SAMPLE_NAMES:

    # Derive the time of day from the sample name. It is reset for every
    # sample so a name containing neither marker yields a missing value
    # rather than an unbound variable or the previous sample's value.
    if "AM" in sample:
        sample_time_of_day = "AM"
    elif "PM" in sample:
        sample_time_of_day = "PM"
    else:
        sample_time_of_day = None

    # Cells of this sample per label; looked up lazily, once per sample,
    # rather than once per cell barcode
    sample_cells_by_label = None

    # For each sample and subject, we get their gene count matrix (normalized)
    for subject_id in config.SUBJECT_IDS:

        cell_transcript_counts = dataset.get_transcript_counts(
            sample, subject_id=subject_id, normalized=True)

        # If this returns None, it means this subject wasn't found in this
        # sample; skipping
        if cell_transcript_counts is None:
            continue

        if sample_cells_by_label is None:
            # The returned containers are kept as-is so that the membership
            # test below behaves exactly as it would on a fresh lookup
            sample_cells_by_label = {
                label: dataset.get_cells(sample, cell_type=label)
                for label in cell_type_labels
            }

        gender = config.SUBJECT_ID_GENDERS[subject_id]

        for cell_barcode in cell_transcript_counts.row_names:

            # All labels this cell belongs to, joined with ";"
            cell_types = ";".join(
                label
                for label in cell_type_labels
                if cell_barcode in sample_cells_by_label[label]
            )

            cell_row = (cell_barcode, sample, sample_time_of_day, subject_id, gender, cell_types)

            cell_data.append(cell_row)

        # Queue these cell transcript counts for the combined matrix
        transcript_count_blocks.append(cell_transcript_counts.to_array())

# A matrix containing the gene expression across all samples (cells x genes)
combined_cell_transcript_counts = numpy.concatenate(transcript_count_blocks)

In [None]:
# Transform the normalized gene expression values into PCA space (30 components).
# random_state is pinned here because this cell runs before the cell that seeds
# NumPy's global RNG: scikit-learn may pick a randomized SVD solver for a matrix
# of this size, and without a fixed state the components (and the t-SNE built
# on top of them) would differ from run to run.
combined_cell_PCs = PCA(
    n_components=30,
    random_state=42
).fit_transform(
    combined_cell_transcript_counts
)

In [None]:
# Seed NumPy's global random number generator so that the stochastic steps
# executed after this cell draw from a repeatable stream
numpy.random.seed(seed=42)

In [None]:
# Transform the combined PCA coordinates into 2D t-SNE space.
# random_state is passed explicitly (same seed value as the global NumPy seed
# used in this notebook) so the embedding is reproducible even when this cell
# is re-run on its own, without re-running the seeding cell first.
combined_cell_coordinates = TSNE(
    verbose=True,
    perplexity=30,
    n_components=2,
    random_state=42
).fit_transform(
    combined_cell_PCs
)

In [None]:
# Assemble the per-cell metadata collected above into a table, one row per cell.
# NOTE(review): "Cell Barocde" looks like a typo for "Cell Barcode"; it is kept
# unchanged because anything reading this CSV may rely on the existing header.
cell_data_df = pandas.DataFrame.from_records(
    cell_data,
    columns=["Cell Barocde", "Sample", "Time of Day", "Subject ID", "Gender", "Cell Type(s)"]
)

# Attach the 2D t-SNE coordinates; rows are in the same order as cell_data
cell_data_df["t-SNE x"] = combined_cell_coordinates[:, 0]
cell_data_df["t-SNE y"] = combined_cell_coordinates[:, 1]

# Make sure the output directory exists so the export doesn't fail on a fresh
# checkout, then write the table without the DataFrame index
output_directory = "data"
os.makedirs(output_directory, exist_ok=True)
cell_data_df.to_csv(os.path.join(output_directory, "cell_tSNE_coordinates_metadata.csv"), index=False)