In [None]:
import os

import numpy
import pandas
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import capblood_seq
from capblood_seq import config

import pickle

# Fixed random seed; passed to TSNE as random_state so the embedding is reproducible
SEED = 1040

In [None]:
# Load the dataset from the local "data" directory into memory. Per the
# original note, the data is downloaded first if it is not already present.
dataset = capblood_seq.load_dataset(
    data_directory="data",
    pipeline_name="visualization",
)

In [None]:
# For the combined tSNE, gather every cell across all samples together with the
# metadata needed for plotting: originating sample, time of day, subject,
# gender and cell type label(s).

# Per-sample transcript count arrays; concatenated row-wise after the loop
combined_transcript_counts = []

# One (barcode, sample, time of day, subject ID, gender, cell types) tuple per cell,
# appended in the same order as the rows of combined_transcript_counts
cell_data = []

for sample in config.SAMPLE_NAMES:

    # Derive the time of day from the sample name. The value is reset for every
    # sample so that a name containing neither marker yields None instead of a
    # NameError (first sample) or a stale value carried over from the previous one.
    if "AM" in sample:
        sample_time_of_day = "AM"
    elif "PM" in sample:
        sample_time_of_day = "PM"
    else:
        sample_time_of_day = None

    ged = dataset._sample_datasets[sample]
    label_cells = ged.get_label_cells()

    cell_transcript_counts = ged.get_cell_transcript_counts()

    # The lookups below do not depend on the individual cell, so build them once
    # per sample instead of once per cell.
    # NOTE(review): assumes label_cells[...] and dataset.get_cells(...) return
    # iterables of cell barcodes, so set membership matches `in` on the
    # original containers - confirm against capblood_seq.

    # Subjects present in this sample, in config.SUBJECT_IDS order (first match wins)
    subject_barcodes = [
        (subject_id, set(label_cells[subject_id]))
        for subject_id in config.SUBJECT_IDS
        if subject_id in label_cells
    ]

    # Cell type labels in output order: each cell type followed by its subtypes
    cell_type_barcodes = []
    for cell_type in config.CELL_TYPES:
        cell_type_barcodes.append(
            (cell_type, set(dataset.get_cells(sample, cell_type=cell_type)))
        )

        if cell_type in config.CELL_SUBTYPES:
            for cell_subtype in config.CELL_SUBTYPES[cell_type]:
                cell_subtype_label = "%s %s" % (cell_subtype, cell_type)
                cell_type_barcodes.append(
                    (
                        cell_subtype_label,
                        set(dataset.get_cells(sample, cell_type=cell_subtype_label)),
                    )
                )

    for cell_barcode in cell_transcript_counts.row_names:

        # First subject whose labelled cells include this barcode, else None
        subject_id_to_append = next(
            (
                subject_id
                for subject_id, barcodes in subject_barcodes
                if cell_barcode in barcodes
            ),
            None,
        )

        # All matching cell type / subtype labels, ";"-separated ("" if none match)
        cell_types = ";".join(
            label
            for label, barcodes in cell_type_barcodes
            if cell_barcode in barcodes
        )

        if subject_id_to_append is not None:
            gender = config.SUBJECT_ID_GENDERS[subject_id_to_append]
        else:
            gender = None

        cell_row = (
            cell_barcode,
            sample,
            sample_time_of_day,
            subject_id_to_append,
            gender,
            cell_types,
        )

        cell_data.append(cell_row)

    cell_transcript_counts = cell_transcript_counts.to_array()

    # Add these cell transcript counts to the combined matrix
    combined_transcript_counts.append(cell_transcript_counts)

# Stack the per-sample arrays along the cell axis (rows)
combined_transcript_counts = numpy.concatenate(
    combined_transcript_counts, axis=0)

In [None]:
# Parameters that identify which precomputed latent representation to load.
# They are only used here to build the pickle's file name (presumably they are
# the settings of the variational autoencoder run that produced it - confirm).
n_epochs = 50
learning_rate = 1e-3

# Path of the pickled latent representation inside the dataset's data directory
latent_pickle_base_name = "dobreva2020_nepoch_%i_lr_%.1e_latent.pickle" % (n_epochs, learning_rate)
latent_pickle_file_name = os.path.join(
    dataset.data_directory,
    "Dobreva2020",
    latent_pickle_base_name,
)

# Load the latent representation from disk.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open(latent_pickle_file_name, "rb") as latent_pickle_file:
    latent = pickle.load(latent_pickle_file)

In [None]:
# Embed the loaded latent representation (`latent`) into two-dimensional t-SNE
# space. random_state is fixed to SEED so repeated runs give the same embedding.
tsne_model = TSNE(
    n_components=2,
    perplexity=30,
    random_state=SEED,
    verbose=True,
)
combined_cell_coordinates = tsne_model.fit_transform(latent)

In [None]:
# Assemble the per-cell metadata table, attach the two t-SNE coordinates as
# extra columns and write everything to a CSV file (without the index column)
metadata_columns = [
    "Cell Barcode",
    "Sample",
    "Time of Day",
    "Subject ID",
    "Gender",
    "Cell Type(s)",
]
cell_data_df = pandas.DataFrame.from_records(cell_data, columns=metadata_columns)

# Column 0 of the embedding is the x coordinate, column 1 the y coordinate
for axis_index, axis_column in enumerate(("t-SNE x", "t-SNE y")):
    cell_data_df[axis_column] = combined_cell_coordinates[:, axis_index]

output_csv_path = os.path.join("data", "cell_tSNE_coordinates_metadata.csv")
cell_data_df.to_csv(output_csv_path, index=False)