In [None]:
import capblood_seq
import scvi
from scvi.dataset import GeneExpressionDataset
import numpy
from sparsedat import wrappers 
import scipy
from scvi.models.vae import VAE
from scvi.inference import UnsupervisedTrainer
from scvi.models.scanvi import SCANVI
import scanpy as sc
from plotly import offline as plotly
from sklearn.manifold import TSNE
from plotly import graph_objects
from umap import UMAP
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
import torch
import pickle
import os
import random
import pandas as pd
from scvi import set_seed
import pickle
from scrapi.dataset import Gene_Expression_Dataset as GED
from capblood_seq import config

In [None]:
# Single seed shared by training, t-SNE and anything else stochastic
SEED = 1040

In [None]:
# Load the debris-filtered dataset
capblood_seq_data = capblood_seq.load_dataset("data", pipeline_name="debris_filtered")

# Stack the per-sample transcript counts row-wise into one matrix, and record
# for every row the index of the sample it came from.
per_sample_counts = []
cell_sample_index = []

for sample_index, sample in enumerate(config.SAMPLE_NAMES):
    sample_counts = (
        capblood_seq_data._sample_datasets[sample]
        .get_cell_transcript_counts()
        .to_array()
    )
    per_sample_counts.append(sample_counts)
    cell_sample_index += [sample_index] * sample_counts.shape[0]

combined_transcript_counts = numpy.concatenate(per_sample_counts, axis=0)

In [None]:
# Wrap the combined counts in a GeneExpressionDataset, passing the per-cell
# sample index as the batch assignment.
ged = GeneExpressionDataset()

populate_options = {
    "gene_names": capblood_seq_data.gene_list,
    "batch_indices": cell_sample_index,
}
ged.populate_from_data(combined_transcript_counts, **populate_options)

In [None]:
# Training hyper-parameters for the variational autoencoder
n_epochs = 50
learning_rate = 1e-3

# Cache files for the latent embedding and the trained model weights.
# The hyper-parameters are part of the file names, so runs with different
# settings are cached separately.
cache_directory = os.path.join("data", "Dobreva2020")

latent_pickle_file_name = os.path.join(
    cache_directory,
    "dobreva2020_nepoch_%i_lr_%.1e_latent.pickle" % (n_epochs, learning_rate),
)
weights_pickle_file_name = os.path.join(
    cache_directory,
    "dobreva2020_nepoch_%i_lr_%.1e_weights.pickle" % (n_epochs, learning_rate),
)

In [None]:
# Model sized from the dataset: one input per gene, one batch per sample
vae = VAE(ged.nb_genes, n_batch=ged.n_batches)

# Trainer settings are collected in one place so they are easy to tune
trainer_options = dict(train_size=0.8, frequency=1, seed=SEED)
trainer = UnsupervisedTrainer(vae, ged, **trainer_options)

In [None]:
# Train the VAE only when no complete cached run exists; otherwise restore it.
# Both cache files are required: the weights are loaded back into
# `trainer.model` and the latent embedding feeds the cells below. If only one
# of the two files is present the run is repeated and both are rewritten.
cache_file_names = (latent_pickle_file_name, weights_pickle_file_name)
cache_is_complete = all(os.path.exists(name) for name in cache_file_names)

if not cache_is_complete:

    # Create the cache directories up front, so a missing folder cannot make
    # the save fail after the training run has already finished.
    for cache_file_name in cache_file_names:
        os.makedirs(os.path.dirname(cache_file_name) or ".", exist_ok=True)

    set_seed(SEED)

    trainer.train(n_epochs=n_epochs, lr=learning_rate)
    torch.save(trainer.model.state_dict(), weights_pickle_file_name)

    # Embed every cell of the dataset, not only the training split.
    # NOTE(review): `.sequential()` presumably keeps the cells in dataset
    # order so that rows line up with `cell_sample_index` - confirm.
    full = trainer.create_posterior(trainer.model, ged, indices=numpy.arange(len(ged)))
    latent, _, _ = full.sequential().get_latent()

    with open(latent_pickle_file_name, 'wb') as latent_pickle_file:
        pickle.dump(latent, latent_pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

else:

    weights_pickle_file = torch.load(weights_pickle_file_name)
    trainer.model.load_state_dict(weights_pickle_file)

    # SECURITY: unpickling runs arbitrary code stored in the file. Only load
    # cache files that were produced by this notebook.
    with open(latent_pickle_file_name, 'rb') as latent_pickle_file:
        latent = pickle.load(latent_pickle_file)

In [None]:
# 2-D embedding of the latent space for plotting; seeded for reproducibility
tsne_model = TSNE(n_components=2, random_state=SEED)
tsne = tsne_model.fit_transform(latent)

In [None]:
# Cluster the cells in latent space; the cluster ids produced here are the
# keys that get mapped to cell types further down.
clustering_model = AgglomerativeClustering(n_clusters=13)
clusters = clustering_model.fit_predict(latent)

In [None]:
# One scatter trace per cluster, so each cluster gets its own legend entry
traces = []

for cluster_index in range(clusters.max() + 1):

    in_cluster = clusters == cluster_index

    traces.append(
        graph_objects.Scatter(
            x=tsne[in_cluster, 0],
            y=tsne[in_cluster, 1],
            name="Cluster %i" % cluster_index,
            mode="markers",
        )
    )

figure = graph_objects.Figure(traces)

plotly.iplot(figure)

In [None]:
# Manual annotation: cluster id -> cell type label.
# Several clusters may share one label.
# NOTE(review): cluster 1 has no entry - presumably left unlabelled on
# purpose; confirm.
cluster_cell_marker_map = {
    3: 'CD4 T Cells',
    7: 'CD4 T Cells',

    2: 'CD8 T Cells',
    9: 'CD8 T Cells',

    4: 'NK Cells',
    5: 'NK Cells',

    11: 'B Cells',
    12: 'B Cells',

    0: 'CD14 Monocytes',
    10: 'CD14 Monocytes',

    6: 'CD16 Monocytes',

    8: 'Dendritic Cells',
}

cluster_cell_marker_map

In [None]:
# Write the cell type of every labelled cluster back onto the per-sample
# datasets. `clusters` follows the row order of the combined matrix, i.e. the
# samples in `config.SAMPLE_NAMES` order, so each sample owns one contiguous
# slice of it.

# Refuse to label anything if the clustering does not line up with the
# samples; otherwise labels would silently land on the wrong cells.
total_sample_cells = sum(
    capblood_seq_data._sample_datasets[sample].num_cells
    for sample in config.SAMPLE_NAMES
)
if total_sample_cells != len(clusters):
    raise ValueError(
        "clusters has %i entries but the samples hold %i cells"
        % (len(clusters), total_sample_cells)
    )

cell_index = 0

for sample_index, sample in enumerate(config.SAMPLE_NAMES):

    # Deliberately not called `ged`: that name holds the scVI dataset the
    # trainer uses and must stay intact if earlier cells are re-run.
    sample_ged = capblood_seq_data._sample_datasets[sample]

    num_cells = sample_ged.num_cells

    # Rows are cells
    cell_barcodes = numpy.array(sample_ged._cell_transcript_counts.row_names)

    # Cluster assignment of this sample's cells only
    sample_clusters = clusters[cell_index:cell_index + num_cells]

    for cluster, label in cluster_cell_marker_map.items():

        label_barcodes = cell_barcodes[sample_clusters == cluster]

        sample_ged.label_cells(label, label_barcodes)

    sample_ged.save_labels()

    cell_index += num_cells

In [None]:
# Colour the t-SNE embedding by the raw transcript count of a single gene
GENE = "COX5A"

# Column of the combined count matrix that holds this gene
gene_index = capblood_seq_data.gene_list.index(GENE)

traces = []

x = tsne[:, 0]
y = tsne[:, 1]

trace = graph_objects.Scatter(
    x=x,
    y=y,
    # Name the trace after the gene it shows. It used to be named after
    # `cluster_index`, a loop variable left over from an earlier cell.
    name=GENE,
    mode="markers",
    marker={
        "color": combined_transcript_counts[:, gene_index].flatten(),
        # Show the colour bar so the colours can be read as counts
        "showscale": True
    }
)

traces.append(trace)

figure = graph_objects.Figure(traces)

plotly.iplot(figure)

In [None]:
def create_mask(index_array,num_elements):
    """Return an integer 0/1 vector of length `num_elements`.

    Positions listed in `index_array` are set to 1, all others stay 0.
    """
    flags = numpy.full(num_elements, 0, dtype=int)
    flags[index_array] = 1
    return flags

# Build a table with, for every (sample, subject) pair, the percentage of that
# subject's cells in that sample that fall into each cell type.
#
# Rows are ordered sample-major: all subjects of the first sample, then all
# subjects of the second sample, and so on. They keep the historical "Subject"
# index of '1', '2', ... A pair without cells keeps a row of zeros.
cell_types = ['B Cells','CD4 T Cells','CD8 T Cells','Dendritic Cells','CD14 Monocytes','CD16 Monocytes','Neutrophils','NK Cells','T Cells','Monocytes']

# Batch index i of the combined matrix corresponds to config.SAMPLE_NAMES[i]
sample_names = list(config.SAMPLE_NAMES)
subject_ids = list(capblood_seq.config.SUBJECT_IDS)

# Kept under its original name in case other cells refer to it
combined_batch_indices = cell_sample_index

# For every cluster label, the table columns it contributes to. A column
# matches when its name occurs inside the label, so 'T Cells' collects the
# CD4/CD8 labels and 'Monocytes' collects the CD14/CD16 labels. 'Monocytes'
# additionally collects 'Dendritic Cells': the earlier condition
# `('Monocytes' or 'Dendritic Cells') in label` only ever tested 'Monocytes'.
label_columns = {}
for label in set(cluster_cell_marker_map.values()):
    label_columns[label] = [
        column_index
        for column_index, cell_type in enumerate(cell_types)
        if cell_type in label
        or (cell_type == 'Monocytes' and 'Dendritic Cells' in label)
    ]

# Accumulate in a plain array and build the DataFrame once at the end
cell_stats = numpy.zeros((len(sample_names) * len(subject_ids), len(cell_types)))

cell_index_offset = 0

for batch_index, sample in enumerate(sample_names):

    # Deliberately not called `ged`, which holds the scVI dataset
    sample_ged = capblood_seq_data._sample_datasets[sample]

    num_sample_cells = sample_ged.num_cells

    # Rows are cells; this sample owns one contiguous slice of `clusters`
    sample_barcodes = numpy.array(sample_ged._cell_transcript_counts.row_names)
    sample_clusters = clusters[cell_index_offset:cell_index_offset + num_sample_cells]

    for subject_position, subject_id in enumerate(subject_ids):

        row = batch_index * len(subject_ids) + subject_position

        try:
            subject_cell_barcodes = list(sample_ged.get_cells(subject_id))
        except Exception:
            # NOTE(review): presumably raised when this subject has no cells
            # in this sample - confirm the exception type and narrow this.
            continue

        if len(subject_cell_barcodes) == 0:
            continue

        # Cells of this sample that belong to this subject
        in_subject = numpy.isin(sample_barcodes, subject_cell_barcodes)
        num_subject_cells = int(in_subject.sum())

        # Nothing to normalise by; avoids a 0/0 that would put NaN in the table
        if num_subject_cells == 0:
            continue

        subject_clusters = sample_clusters[in_subject]

        # Only labelled clusters contribute; clusters without an entry in
        # `cluster_cell_marker_map` are skipped.
        for cluster, label in cluster_cell_marker_map.items():

            num_cluster_cells = int((subject_clusters == cluster).sum())
            percentage_of_cells = num_cluster_cells * 100 / num_subject_cells

            for column_index in label_columns[label]:
                cell_stats[row, column_index] += percentage_of_cells

    cell_index_offset += num_sample_cells

lee_cell_stat_pd = pd.DataFrame(
    cell_stats,
    index=pd.Index(
        [str(row + 1) for row in range(cell_stats.shape[0])],
        name="Subject"
    ),
    columns=cell_types
)

lee_cell_stat_pd

In [None]:
# Display the cell-type percentage table
lee_cell_stat_pd
