In [None]:
import os
import numpy

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from plotly import offline as plotly
from plotly import graph_objects

from capblood_seq import common as cbs
from capblood_seq import viz as cbs_viz
from capblood_seq.dataset import Capblood_Seq_Dataset

In [None]:
# Build the dataset handle for the chosen pipeline and load it into memory.
# Per the original author's note, load() downloads the data first when no
# local copy exists yet.
PIPELINE_NAME = "RPM_SD"

dataset = Capblood_Seq_Dataset(pipeline_name=PIPELINE_NAME)
dataset.load()

In [None]:
# The combined tSNE should only contain unambiguously labeled cells. First drop
# cells without any label, then apply the multi-label filter once for the
# subject IDs and once for the cell types (same order as before).
dataset.filter_unlabeled_cells()

for label_group in (cbs.SUBJECT_IDS, cbs.CELL_TYPES):
    dataset.filter_multi_labeled_cells(label_group)

In [None]:
# For the combined tSNE, gather the cells of every (sample, subject) pair and
# remember each cell's originating sample and subject for plotting.

# One sample label per cell, in the same row order as the combined matrix
cell_sample_labels = []

# One subject label per cell, in the same row order as the combined matrix
cell_subject_labels = []

# Per-(sample, subject) expression blocks, stacked ONCE after the loop.
# Calling numpy.concatenate on the growing matrix inside the loop re-copies
# every row gathered so far on each iteration, which is quadratic in the
# number of cells. The list is seeded with an empty (0, num_genes) float
# matrix so the stacked result keeps the same dtype promotion and the same
# shape when no subject is found in any sample.
transcript_count_blocks = [numpy.zeros((0, dataset.get_num_genes()))]

for sample in cbs.SAMPLE_NAMES:

    for subject_id in cbs.SUBJECT_IDS:

        # Normalized gene count matrix for this sample/subject combination
        cell_transcript_counts = dataset.get_transcript_counts(sample, subject_id=subject_id, normalized=True)

        # None means this subject wasn't found in this sample; skip it
        if cell_transcript_counts is None:
            continue

        transcript_count_blocks.append(cell_transcript_counts.to_array())

        # Every cell in this block came from the same sample and subject, so
        # the labels are just repeated once per row
        num_cells = cell_transcript_counts.shape[0]
        cell_sample_labels.extend([sample] * num_cells)
        cell_subject_labels.extend([subject_id] * num_cells)

# A matrix containing the gene expression across all samples (cells x genes)
combined_cell_transcript_counts = numpy.concatenate(transcript_count_blocks)

# The per-block arrays are no longer needed; release them so the kernel does
# not hold two copies of the expression data
del transcript_count_blocks

# Convert these into numpy arrays for later masking
cell_sample_labels = numpy.array(cell_sample_labels)
cell_subject_labels = numpy.array(cell_subject_labels)

# The boolean masks built from the labels index rows of the matrix, so the
# three must stay aligned
assert combined_cell_transcript_counts.shape[0] == len(cell_sample_labels) == len(cell_subject_labels)

In [None]:
# Transform the normalized gene expression values into PCA space.
#
# The seed is fixed because scikit-learn's automatic solver selection may use
# the randomized SVD solver, in which case an unseeded PCA gives slightly
# different components (and therefore a different downstream t-SNE) each run.
N_PRINCIPAL_COMPONENTS = 30
PCA_RANDOM_STATE = 0

combined_cell_PCs = PCA(
    n_components=N_PRINCIPAL_COMPONENTS,
    random_state=PCA_RANDOM_STATE
).fit_transform(
    combined_cell_transcript_counts
)

In [None]:
# Transform the combined PCA coordinates into 2-D t-SNE space.
#
# t-SNE is stochastic: without a fixed random_state the layout changes on
# every run, so the saved publication figures could not be regenerated.
TSNE_RANDOM_STATE = 0

combined_cell_coordinates = TSNE(
    verbose=True,
    perplexity=30,
    n_components=2,
    random_state=TSNE_RANDOM_STATE
).fit_transform(
    combined_cell_PCs
)

In [None]:
# Combined t-SNE with one legend entry (trace) per sample: the marker shape
# encodes the sample, the marker color encodes the subject of each cell.

# What shape to use for each sample
SAMPLE_MARKERS = {
    "AM1": "triangle-up",
    "PM1": "circle",
    "AM2": "square",
    "PM2": "star",
    "AM3": "cross",
    "PM3": "diamond"
}


# Add a different scatter trace for each sample
sample_traces = []

for sample in cbs.SAMPLE_NAMES:

    # Make a boolean mask representing the cells that belong to this sample
    sample_cell_mask = (cell_sample_labels == sample)

    # Grab the t-SNE coordinates of this sample's cells
    x_values = combined_cell_coordinates[sample_cell_mask, 0]
    y_values = combined_cell_coordinates[sample_cell_mask, 1]

    # Grab the associated (subject) color for each point in this sample
    subject_colors = [cbs_viz.SUBJECT_ID_COLORS[subject_id]
                      for subject_id in cell_subject_labels[sample_cell_mask]]

    # Make the plotly scatter trace
    scatter_trace = graph_objects.Scatter(
        x=x_values,
        y=y_values,
        mode="markers",
        marker={
            "size": 2,
            "color": subject_colors,
            "symbol": SAMPLE_MARKERS[sample]
        },
        name=sample
    )

    sample_traces.append(scatter_trace)

# Blank the background
layout = graph_objects.Layout(
    {
        "plot_bgcolor": "rgba(255, 255, 255, 0)",
        "paper_bgcolor": "rgba(255, 255, 255, 0)"
    }
)

# Plot it
figure = graph_objects.Figure(data=sample_traces, layout=layout)
plotly.iplot(figure)

# Save for publication! Make sure the output directory exists first, otherwise
# the writes below fail on a fresh checkout.
figures_directory = "figures"
os.makedirs(figures_directory, exist_ok=True)

figure.write_image(os.path.join(figures_directory, "combined_tSNE_labeled_by_sample.svg"))
figure.write_html(os.path.join(figures_directory, "combined_tSNE_labeled_by_sample.html"))

In [None]:
# Repeat the combined t-SNE, this time with one legend entry (trace) per
# subject: the marker color encodes the subject, the marker shape encodes the
# sample each cell came from. Relies on SAMPLE_MARKERS defined in the
# by-sample plotting cell.

# Add a different scatter trace for each subject. (The list keeps the name
# `sample_traces` used by the by-sample plot, but here holds one trace per
# subject.)
sample_traces = []

for subject_id in cbs.SUBJECT_IDS:

    # Make a boolean mask representing the cells that belong to this subject
    subject_id_cell_mask = (cell_subject_labels == subject_id)

    # Grab the t-SNE coordinates of this subject's cells
    x_values = combined_cell_coordinates[subject_id_cell_mask, 0]
    y_values = combined_cell_coordinates[subject_id_cell_mask, 1]

    # Grab the associated (sample) marker for each point in this subject
    sample_markers = [SAMPLE_MARKERS[sample] for sample in cell_sample_labels[subject_id_cell_mask]]

    # Make the plotly scatter trace
    scatter_trace = graph_objects.Scatter(
        x=x_values,
        y=y_values,
        mode="markers",
        marker={
            "size": 2,
            "color": cbs_viz.SUBJECT_ID_COLORS[subject_id],
            "symbol": sample_markers
        },
        name=subject_id
    )

    sample_traces.append(scatter_trace)

# Blank the background
layout = graph_objects.Layout(
    {
        "plot_bgcolor": "rgba(255, 255, 255, 0)",
        "paper_bgcolor": "rgba(255, 255, 255, 0)"
    }
)

# Plot it
figure = graph_objects.Figure(data=sample_traces, layout=layout)
plotly.iplot(figure)

# Save for publication! Make sure the output directory exists first, otherwise
# the writes below fail on a fresh checkout.
figures_directory = "figures"
os.makedirs(figures_directory, exist_ok=True)

figure.write_image(os.path.join(figures_directory, "combined_tSNE_labeled_by_subject.svg"))
figure.write_html(os.path.join(figures_directory, "combined_tSNE_labeled_by_subject.html"))