In [None]:
import os
import numpy

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from plotly import offline as plotly
from plotly import graph_objects

import capblood_seq
from capblood_seq import config

import pickle

# Fixed seed, passed as random_state to the t-SNE fit so the embedding is repeatable
SEED=1040

In [None]:
# Fetch the debris-filtered dataset (it is downloaded first if it is not already
# present locally) and keep it in memory for the rest of the notebook
dataset = capblood_seq.load_dataset(
    pipeline_name="debris_filtered"
)

In [None]:
# Inspect the cell types defined in the package config
config.CELL_TYPES

In [None]:
# Inspect the cell subtypes defined in the package config (a mapping keyed by cell type)
config.CELL_SUBTYPES

In [None]:
# For the combined tSNE, gather every cell from every sample and record, per cell,
# the sample it came from, the subject it was assigned to and its cell type.

# The sample label of each cell
cell_sample_labels = []

# The subject label of each cell (None when the barcode is in no subject's label set)
cell_subject_labels = []

# One transcript count array per sample; concatenated into a single matrix below
combined_transcript_counts = []

# The cell type label of each cell (None when the barcode is in no cell type's label set)
cell_type_labels = []

for sample in config.SAMPLE_NAMES:

    ged = dataset._sample_datasets[sample]

    # Mapping from label name (subject id, cell type, ...) to the barcodes carrying it
    label_cells = ged.get_label_cells()

    cell_transcript_counts = ged.get_cell_transcript_counts()

    # Labels are appended in row_names order so that they line up with the rows of
    # the count array taken from the same object further down
    for cell_barcode in cell_transcript_counts.row_names:

        # First subject whose label set contains this barcode. A subject may have no
        # label in this sample at all, hence the key check before the membership test.
        subject_id_to_append = next(
            (
                subject_id
                for subject_id in config.SUBJECT_IDS
                if subject_id in label_cells
                and cell_barcode in label_cells[subject_id]
            ),
            None
        )

        # First cell type whose label set contains this barcode. Only the top-level
        # cell type is recorded, so config.CELL_SUBTYPES does not need to be consulted.
        # NOTE(review): if subtype-level labels are wanted instead, look up
        # "<subtype> <cell type>" in label_cells here - confirm those keys exist first.
        cell_type_to_append = next(
            (
                cell_type
                for cell_type in config.CELL_TYPES
                if cell_barcode in label_cells[cell_type]
            ),
            None
        )

        cell_subject_labels.append(subject_id_to_append)
        cell_type_labels.append(cell_type_to_append)

    cell_transcript_counts = cell_transcript_counts.to_array()

    # Every row of this array came from the current sample, so the sample label can
    # simply be repeated once per row
    cell_sample_labels.extend([sample] * cell_transcript_counts.shape[0])

    # Queue this sample's counts for the combined matrix
    combined_transcript_counts.append(cell_transcript_counts)

# Convert the label lists into numpy arrays for later masking. The subject and cell
# type labels can contain None, so they are stored as object arrays; that keeps the
# `!= None` masks used for plotting element-wise whether or not a None is present.
cell_sample_labels = numpy.array(cell_sample_labels)
cell_subject_labels = numpy.array(cell_subject_labels, dtype=object)
cell_type_labels = numpy.array(cell_type_labels, dtype=object)

# Stack the per-sample count arrays row-wise into one cells x genes matrix
combined_transcript_counts = numpy.concatenate(
   combined_transcript_counts, axis=0)

In [None]:
# Training parameters of the variational autoencoder run whose latent representation
# we want; in this cell they are only used to build the name of the saved file
n_epochs = 50
learning_rate = 1e-3

# Path of the pickled latent representation for that run
latent_pickle_file_name = os.path.join(
    "data",
    "Dobreva2020",
    "dobreva2020_nepoch_%i_lr_%.1e_latent.pickle" % (n_epochs, learning_rate)
)

# Load the previously saved latent representation from disk.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open(latent_pickle_file_name, mode="rb") as latent_pickle_file:
    latent = pickle.load(latent_pickle_file)

In [None]:
# Embed the loaded latent representation into two dimensions with t-SNE.
# The seed is fixed so that repeated runs give the same layout.
tsne = TSNE(n_components=2, perplexity=30, random_state=SEED, verbose=True)
combined_cell_coordinates = tsne.fit_transform(latent)

In [None]:
# Cells that matched no subject carry a None subject label; register a color for None
# so the per-cell color lookup in the plotting code also works for those cells
config.SUBJECT_ID_COLORS[None] = "Grey"

In [None]:
# Marker symbol used to draw the cells of each sample
SAMPLE_MARKERS = {
    "AM1": "triangle-up",
    "PM1": "circle",
    "AM2": "square",
    "PM2": "star",
    "AM3": "cross",
    "PM3": "diamond"
}

# One scatter trace per sample
sample_traces = []

for sample in config.SAMPLE_NAMES:

    # Boolean mask of the cells that belong to this sample and have a cell type.
    # `!= None` is deliberate: on a numpy array it compares element by element,
    # whereas `is not None` would test the array object itself.
    sample_cell_mask = (cell_sample_labels == sample) & (cell_type_labels != None)

    # t-SNE coordinates of those cells
    x_values = combined_cell_coordinates[sample_cell_mask, 0]
    y_values = combined_cell_coordinates[sample_cell_mask, 1]

    # Color every point by the subject its cell was assigned to
    subject_colors = [config.SUBJECT_ID_COLORS[subject_id]
                      for subject_id in cell_subject_labels[sample_cell_mask]]

    # Build the plotly scatter trace: one symbol per sample, one color per subject
    scatter_trace = graph_objects.Scatter(
        x=x_values,
        y=y_values,
        mode="markers",
        marker={
            "size": 3,
            "color": subject_colors,
            "symbol": SAMPLE_MARKERS[sample]
        },
        name=sample
    )

    sample_traces.append(scatter_trace)

# Fully transparent plot and paper backgrounds
layout = graph_objects.Layout(
    {
        "plot_bgcolor": "rgba(255, 255, 255, 0)",
        "paper_bgcolor": "rgba(255, 255, 255, 0)"
    }
)

# Plot it
figure = graph_objects.Figure(data=sample_traces, layout=layout)
plotly.iplot(figure)

# Save for publication! Create the output directory first so that a fresh checkout
# does not fail when the figure files are written.
figures_dir = "figures"
os.makedirs(figures_dir, exist_ok=True)
figure.write_image(os.path.join(figures_dir, "combined_tSNE_labeled_by_sample.svg"))
figure.write_html(os.path.join(figures_dir, "combined_tSNE_labeled_by_sample.html"))

In [None]:
# Repeat the plot, this time grouping the points by subject

# One scatter trace per subject (the list keeps the name used by the by-sample plot)
sample_traces = []

for subject_id in config.SUBJECT_IDS:

    # Boolean mask of the cells that belong to this subject and have a cell type.
    # `!= None` is deliberate: on a numpy array it compares element by element,
    # whereas `is not None` would test the array object itself.
    subject_id_cell_mask = (cell_subject_labels == subject_id) & (cell_type_labels != None)

    # t-SNE coordinates of those cells
    x_values = combined_cell_coordinates[subject_id_cell_mask, 0]
    y_values = combined_cell_coordinates[subject_id_cell_mask, 1]

    # Build the plotly scatter trace: a single color per subject, default marker symbol
    scatter_trace = graph_objects.Scatter(
        x=x_values,
        y=y_values,
        mode="markers",
        marker={
            "size": 2.5,
            "color": config.SUBJECT_ID_COLORS[subject_id]
        },
        name=subject_id
    )

    sample_traces.append(scatter_trace)

# Fully transparent plot and paper backgrounds
layout = graph_objects.Layout(
    {
        "plot_bgcolor": "rgba(255, 255, 255, 0)",
        "paper_bgcolor": "rgba(255, 255, 255, 0)"
    }
)

# Plot it
figure = graph_objects.Figure(data=sample_traces, layout=layout)
plotly.iplot(figure)

# Save for publication! Create the output directory first so that a fresh checkout
# does not fail when the figure files are written.
figures_dir = "figures"
os.makedirs(figures_dir, exist_ok=True)
figure.write_image(os.path.join(figures_dir, "combined_tSNE_labeled_by_subject.svg"))
figure.write_html(os.path.join(figures_dir, "combined_tSNE_labeled_by_subject.html"))