In [None]:
import capblood_seq
import scvi
from scipy import sparse
from scvi.dataset import GeneExpressionDataset
import numpy
from sparsedat import wrappers 
import scipy
from scvi.models.vae import VAE
from scvi.inference import UnsupervisedTrainer
from scvi.models.scanvi import SCANVI
import scanpy as sc
from plotly import offline as plotly
from sklearn.manifold import TSNE
from plotly import graph_objects
from umap import UMAP
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
import torch
import pickle
import os
import random
import pandas as pd
from scvi import set_seed
import pickle
from scrapi.dataset import Gene_Expression_Dataset as GED
from capblood_seq import config

from sparsedat import Sparse_Data_Table as SDT
import anndata

import h5py
import pandas

# Single random seed reused below for scVI seeding (set_seed), the trainer's
# `seed` argument and the t-SNE `random_state`.
SEED=1040

In [None]:
# Accumulators filled by this cell and by the study-loading cells that follow.
# The sample_* lists get one entry per sample; the cell_* lists get one entry
# per cell, in the same order as the rows of the count matrices.
sample_transcript_counts = []
sample_names = []
sample_gene_names = []
cell_sample_names = []
cell_barcodes = []

for sample in config.SAMPLE_NAMES:

    sample_name = "%s-%s" % ("Dobreva2020", sample)

    debris_filtered_sdt = SDT(os.path.join("data", sample, "cell_transcript_counts_debris_filtered.sdt"))

    # Only the feature names and IDs are needed from the raw matrix file.
    # Read each dataset in one go and close the HDF5 handle right away so a
    # file handle is not leaked for every sample.
    with h5py.File(os.path.join("data", sample, "raw_feature_bc_matrix.h5"), "r") as h5_file:
        features = h5_file["matrix"]["features"]
        gene_names = [x.decode("UTF-8") for x in features["name"][:]]
        gene_ids = [x.decode("UTF-8") for x in features["id"][:]]

    # Every ID that shares a given gene name, in file order.
    gene_name_ids = {}
    for gene, gene_id in zip(gene_names, gene_ids):
        gene_name_ids.setdefault(gene, []).append(gene_id)

    # Names that occur more than once are disambiguated as "<name>_<k>", where
    # k is the 1-based rank of the entry's ID among the sorted IDs sharing
    # that name. Unique names are kept as they are.
    disambiguated_gene_names = []
    gene_name_ensembl_id_lookup = {}

    for gene, this_gene_id in zip(gene_names, gene_ids):

        ids_sharing_name = gene_name_ids[gene]

        if len(ids_sharing_name) > 1:
            duplicate_gene_index = sorted(ids_sharing_name).index(this_gene_id)
            disambiguated_gene_name = "%s_%i" % (gene, duplicate_gene_index + 1)
        else:
            disambiguated_gene_name = gene

        disambiguated_gene_names.append(disambiguated_gene_name)
        gene_name_ensembl_id_lookup[disambiguated_gene_name] = this_gene_id

    # Relabel the table's columns from (disambiguated) gene names to gene IDs
    # so they can be matched against the other studies.
    debris_filtered_sdt.column_names = [gene_name_ensembl_id_lookup[gene] for gene in debris_filtered_sdt.column_names]

    debris_filtered_transcript_counts = wrappers.to_csr(debris_filtered_sdt)

    sample_transcript_counts.append(debris_filtered_transcript_counts)
    sample_names.append(sample_name)
    sample_gene_names.append(debris_filtered_sdt.column_names)
    cell_sample_names.extend([sample_name] * debris_filtered_sdt.num_rows)
    cell_barcodes.extend(debris_filtered_sdt.row_names)

In [None]:
# Hashimoto2019: one barcode table, one count matrix and one gene table cover
# all subjects; rows are split per subject using the barcode table.
hashimoto_dir = os.path.join("data", "Hashimoto2019")

hashimoto_barcodes = pd.read_csv(os.path.join(hashimoto_dir, "cell_barcodes.txt"), sep="\t", header=None)
hashimoto_barcodes = hashimoto_barcodes.drop(columns=[2])
hashimoto_barcodes.columns = ["barcode", "sample_id"]

hashimoto_mtx = anndata.read_mtx(os.path.join(hashimoto_dir, "hashimoto.mtx")).X
hashimoto_genes = pd.read_csv(os.path.join(hashimoto_dir, "genes.txt"), sep="\t", header=None)

hashimoto_sample_ids = ["CT1", "CT2", "CT3", "CT4", "CT5"]

for subject_index, sample_id in enumerate(hashimoto_sample_ids):

    sample_name = "Hashimoto2019-%s" % sample_id

    # Boolean row selector for the cells that belong to this subject.
    subject_mask = hashimoto_barcodes["sample_id"] == sample_id
    subject_rows = subject_mask.values
    subject_data = hashimoto_mtx[subject_rows, :]

    sample_transcript_counts.append(subject_data)
    sample_names.append(sample_name)
    # The first column of the gene table is used as the gene identifier.
    sample_gene_names.append(list(hashimoto_genes.values[:, 0]))
    cell_sample_names.extend([sample_name] * subject_data.shape[0])
    cell_barcodes.extend(hashimoto_barcodes["barcode"][subject_mask].values)

In [None]:
# Hu2019: pull the two listed subjects out of the combined AnnData file.
hu_dataset = sc.read_h5ad(os.path.join("data", "Hu2019", "hu_smith.h5ad"))
hu_obs, hu_gene_df, hu_mtx = hu_dataset.obs, hu_dataset.var, hu_dataset.X

control_subject_descriptors = [
    'Pre-THC-S1',  # S1: 21
    'Pre-THC-S2',  # S2: 21
]

for subject_index, subject_descriptor in enumerate(control_subject_descriptors):

    sample_name = "Hu2019-%s" % subject_descriptor

    # Keep rows tagged with study 'hu' whose description matches this subject.
    is_hu_study = hu_obs['study'] == 'hu'
    is_this_subject = hu_obs['sample_description'] == subject_descriptor
    subject_mask = is_hu_study & is_this_subject
    subject_data = hu_mtx[subject_mask.values, :]

    sample_transcript_counts.append(subject_data)
    sample_names.append(sample_name)
    # The first column of the gene table is used as the gene identifier.
    sample_gene_names.append(list(hu_gene_df.values[:, 0]))
    cell_sample_names.extend([sample_name] * subject_data.shape[0])
    cell_barcodes.extend(hu_obs["barcode"][subject_mask].values)

In [None]:
# Two-way lookup between the first column of the Hu2019 gene table (used
# elsewhere in this notebook as the gene identifier) and its second column
# (the gene name). Built in a single pass with explicit positional access:
# integer keys on a Series (`row[1][0]`) rely on a positional fallback that
# pandas has deprecated.
gene_name_ensembl_lookup = {}
ensemble_gene_name_lookup = {}

for ensembl_id, gene_name in hu_gene_df.iloc[:, :2].itertuples(index=False, name=None):
    gene_name_ensembl_lookup[gene_name] = ensembl_id
    ensemble_gene_name_lookup[ensembl_id] = gene_name

In [None]:
# Lee2020: extract the healthy venous blood PBMC samples from the study file.
lee_dataset = sc.read_h5ad(os.path.join("data", "Lee2020", "lee_GSE149689.h5ad"))
lee_obs, lee_gene_df, lee_mtx = lee_dataset.obs, lee_dataset.var, lee_dataset.X

control_subject_descriptors = [
    'Sample 5_Normal 1 scRNA-seq [SW107]',   # S1, age: 63, female
    'Sample 13_Normal 2 scRNA-seq [SW115]',  # S2, age: 54, female
    'Sample 14_Normal 3 scRNA-seq [SW116]',  # S3, age: 67, female
    'Sample 19_Normal 4 scRNA-seq [SW121]',  # S4, age: 64, male
]

for subject_index, subject_descriptor in enumerate(control_subject_descriptors):

    sample_name = "Lee2020-%s" % subject_descriptor

    # Rows whose sample description matches this control subject.
    subject_mask = lee_obs['sample_description'] == subject_descriptor
    subject_data = lee_mtx[subject_mask.values, :]

    sample_transcript_counts.append(subject_data)
    sample_names.append(sample_name)
    # The first column of the gene table is used as the gene identifier.
    sample_gene_names.append(list(lee_gene_df.values[:, 0]))
    cell_sample_names.extend([sample_name] * subject_data.shape[0])
    cell_barcodes.extend(lee_obs["barcode"][subject_mask].values)

In [None]:
# Gene identifiers present in every loaded sample.
gene_id_sets = [set(gene_ids_for_sample) for gene_ids_for_sample in sample_gene_names]

# Sort the shared identifiers so the gene (column) order is the same in every
# kernel session. Iteration order of a set of strings changes between Python
# processes (string hashing is randomised), and the cells below cache model
# weights on disk that are only meaningful for one fixed column order.
ensembl_id_intersection = sorted(set.intersection(*gene_id_sets)) if gene_id_sets else []

In [None]:
# Reorder each sample's columns to the shared gene order, then stack all
# samples row-wise. The sample's position in the list is used as its batch id.
aligned_sample_blocks = []
combined_batch_indices = []

for sample_index, transcript_counts in enumerate(sample_transcript_counts):

    # Column position of every gene identifier in this sample.
    gene_name_index = {}
    for index, gene_name in enumerate(sample_gene_names[sample_index]):
        gene_name_index[gene_name] = index

    gene_indices = [gene_name_index[gene] for gene in ensembl_id_intersection]

    aligned_sample_blocks.append(transcript_counts[:, gene_indices])

    num_cells = transcript_counts.shape[0]
    combined_batch_indices.extend([sample_index] * num_cells)

aligned_sample_transcript_counts = sparse.vstack(aligned_sample_blocks)

In [None]:
# Wrap the stacked counts in an scVI dataset: one gene per shared identifier,
# one batch index per cell.
ged = GeneExpressionDataset()
ged.populate_from_data(
    aligned_sample_transcript_counts,
    batch_indices=combined_batch_indices,
    gene_names=ensembl_id_intersection,
)

In [None]:
# Initialize variational autoencoder training parameters
n_epochs = 50
learning_rate = 1e-3

# Every cached artifact and exported figure/table goes in this directory.
# Create it up front so the first save after a long training run cannot fail
# just because the directory is missing.
output_directory = os.path.join("data", "VenousVsCapillary")
os.makedirs(output_directory, exist_ok=True)

# Cache files are keyed by the training parameters, so changing either value
# above results in a fresh set of files rather than reusing stale ones.
cache_file_prefix = "nepoch_%i_lr_%.1e" % (n_epochs, learning_rate)

latent_pickle_file_name = os.path.join(output_directory, cache_file_prefix + "_latent.pickle")
weights_pickle_file_name = os.path.join(output_directory, cache_file_prefix + "_weights.pickle")
tsne_pickle_file_name = os.path.join(output_directory, cache_file_prefix + "_tsne.pickle")
clusters_pickle_file_name = os.path.join(output_directory, cache_file_prefix + "_clusters.pickle")

In [None]:
# Seed before the network is built: its weights are randomly initialised at
# construction time, so seeding only right before training would leave the
# starting point different on every run.
set_seed(SEED)

vae = VAE(ged.nb_genes, n_batch=ged.n_batches)
trainer = UnsupervisedTrainer(vae, ged, train_size=0.8, frequency=1, seed=SEED)

In [None]:
# Train only when no cached latent space exists; otherwise restore the saved
# weights and the saved latent embedding. Either way `full` ends up as a
# posterior over every cell and `latent` as the embedding.
if not os.path.exists(latent_pickle_file_name):

    set_seed(SEED)

    trainer.train(n_epochs=n_epochs, lr=learning_rate)
    torch.save(trainer.model.state_dict(), weights_pickle_file_name)

    full = trainer.create_posterior(trainer.model, ged, indices=numpy.arange(len(ged)))
    latent, _, _ = full.sequential().get_latent()

    with open(latent_pickle_file_name, 'wb') as latent_pickle_file:
        pickle.dump(latent, latent_pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

else:

    # Load onto the CPU first so weights saved on a GPU machine can still be
    # restored on a CPU-only one; load_state_dict copies the values into the
    # model's existing parameters wherever they live.
    saved_state_dict = torch.load(weights_pickle_file_name, map_location="cpu")
    trainer.model.load_state_dict(saved_state_dict)

    # A freshly constructed torch module is in training mode. Switch to
    # evaluation mode so layers such as dropout and batch norm behave
    # deterministically in the inference done by later cells.
    trainer.model.eval()

    full = trainer.create_posterior(trainer.model, ged, indices=numpy.arange(len(ged)))

    # NOTE: unpickling can execute arbitrary code. Only load cache files that
    # were written by this notebook.
    with open(latent_pickle_file_name, 'rb') as latent_pickle_file:
        latent = pickle.load(latent_pickle_file)

In [None]:
# 2-D t-SNE of the latent space: reuse the cached embedding when present,
# otherwise compute it and write the cache.
if os.path.exists(tsne_pickle_file_name):
    with open(tsne_pickle_file_name, 'rb') as cached_tsne_file:
        tsne = pickle.load(cached_tsne_file)
else:
    tsne_model = TSNE(n_components=2, random_state=SEED)
    tsne = tsne_model.fit_transform(latent)

    with open(tsne_pickle_file_name, 'wb') as cached_tsne_file:
        pickle.dump(tsne, cached_tsne_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Cluster assignment of every cell in latent space (13 clusters): reuse the
# cached labels when present, otherwise compute them and write the cache.
if os.path.exists(clusters_pickle_file_name):
    with open(clusters_pickle_file_name, 'rb') as cached_clusters_file:
        clusters = pickle.load(cached_clusters_file)
else:
    clustering_model = AgglomerativeClustering(n_clusters=13)
    clusters = clustering_model.fit_predict(latent)

    with open(clusters_pickle_file_name, 'wb') as cached_clusters_file:
        pickle.dump(clusters, cached_clusters_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# t-SNE scatter with one trace per cluster, covering every cluster.
traces = []

for cluster_index in range(clusters.max()+1):

    in_cluster = clusters == cluster_index

    traces.append(
        graph_objects.Scatter(
            x=tsne[in_cluster, 0],
            y=tsne[in_cluster, 1],
            name="Cluster %i" % cluster_index,
            mode="markers",
            marker=dict(size=3),
        )
    )

# Blank the background
layout = graph_objects.Layout(
    plot_bgcolor="rgba(255, 255, 255, 0)",
    paper_bgcolor="rgba(255, 255, 255, 0)",
)

figure = graph_objects.Figure(data=traces, layout=layout)

plotly.iplot(figure)

In [None]:
# Cell type label for each annotated cluster index. Clusters missing from
# this map are skipped by the cells below that iterate over it.
cluster_cell_marker_map = {
    3: "B Cells",
    0: 'CD14 Monocytes',
    7: 'CD16 Monocytes',
    9: 'NK Cells',
    1: 'CD8 T Cells',
    11: 'CD8 T Cells',
    5: 'CD4 T Cells',
    2: 'CD4 T Cells',
}

# Labels left disabled in the original notebook:
# cluster_cell_marker_map[6] = 'Neutrophils'
# cluster_cell_marker_map[12] = 'Red Blood Cells'
# cluster_cell_marker_map[8] = 'Dendritic Cells'
# cluster_cell_marker_map[4] = 'Unknown'
# cluster_cell_marker_map[10] = 'Unknown'

In [None]:
# t-SNE scatter restricted to clusters that have a cell type label, one trace
# per cluster; shown inline and exported as SVG and HTML.
traces = []

for cluster_index in range(clusters.max()+1):

    if cluster_index in cluster_cell_marker_map:

        in_cluster = clusters == cluster_index

        traces.append(
            graph_objects.Scatter(
                x=tsne[in_cluster, 0],
                y=tsne[in_cluster, 1],
                name="Cluster %i" % cluster_index,
                mode="markers",
                marker=dict(size=3),
            )
        )

# Blank the background
layout = graph_objects.Layout(
    plot_bgcolor="rgba(255, 255, 255, 0)",
    paper_bgcolor="rgba(255, 255, 255, 0)",
)

figure = graph_objects.Figure(data=traces, layout=layout)

plotly.iplot(figure)

# Save for publication!
cluster_figure_stem = os.path.join("data", "VenousVsCapillary", "combined_tSNE_labeled_by_cluster")
figure.write_image(cluster_figure_stem + ".svg")
figure.write_html(cluster_figure_stem + ".html")

In [None]:
# Per-cell boolean masks: cells whose sample name starts with "Dobreva2020"
# are treated as capillary blood, every other cell as venous blood.
capillary_blood_cells = numpy.char.startswith(cell_sample_names, "Dobreva2020")
venous_blood_cells = numpy.logical_not(capillary_blood_cells)

In [None]:
# t-SNE scatter with one trace per source study, selected by the study prefix
# of each cell's sample name; shown inline and exported as SVG and HTML.
study_names = ("Dobreva2020", "Hashimoto2019", "Lee2020", "Hu2019")

traces = []

for study_name in study_names:

    study_cells = numpy.char.startswith(cell_sample_names, study_name)

    traces.append(
        graph_objects.Scatter(
            x=tsne[study_cells, 0],
            y=tsne[study_cells, 1],
            name=study_name,
            mode="markers",
            marker=dict(size=3),
        )
    )

# Blank the background
layout = graph_objects.Layout(
    plot_bgcolor="rgba(255, 255, 255, 0)",
    paper_bgcolor="rgba(255, 255, 255, 0)",
)

figure = graph_objects.Figure(data=traces, layout=layout)

plotly.iplot(figure)

# Save for publication!
study_figure_stem = os.path.join("data", "VenousVsCapillary", "combined_tSNE_labeled_by_study")
figure.write_image(study_figure_stem + ".svg")
figure.write_html(study_figure_stem + ".html")

In [None]:
# t-SNE scatter with one trace per individual sample.
traces = []

# Convert the per-cell sample names to an array once; it is the same for
# every sample, so there is no need to rebuild it on each loop iteration.
cell_sample_name_array = numpy.array(cell_sample_names)

for sample_name in sample_names:

    sample_cells = cell_sample_name_array == sample_name

    trace = graph_objects.Scatter(
        x=tsne[sample_cells, 0],
        y=tsne[sample_cells, 1],
        name=sample_name,
        mode="markers",
        marker={
            "size": 3
        }
    )

    traces.append(trace)

figure = graph_objects.Figure(traces)

plotly.iplot(figure)

In [None]:
# t-SNE scatter with one trace for capillary cells and one for venous cells;
# shown inline and exported as SVG and HTML.
blood_type_groups = [
    (capillary_blood_cells, "Capillary Blood (n=22)"),
    (venous_blood_cells, "Venous Blood (n=11)"),
]

traces = []

for blood_type_cells, blood_type_label in blood_type_groups:

    traces.append(
        graph_objects.Scatter(
            x=tsne[blood_type_cells, 0],
            y=tsne[blood_type_cells, 1],
            name=blood_type_label,
            mode="markers",
            marker=dict(size=2),
        )
    )

# Blank the background
layout = graph_objects.Layout(
    plot_bgcolor="rgba(255, 255, 255, 0)",
    paper_bgcolor="rgba(255, 255, 255, 0)",
)

figure = graph_objects.Figure(data=traces, layout=layout)

plotly.iplot(figure)

# Save for publication!
blood_type_figure_stem = os.path.join("data", "VenousVsCapillary", "combined_tSNE_labeled_by_blood_type")
figure.write_image(blood_type_figure_stem + ".svg")
figure.write_html(blood_type_figure_stem + ".html")

In [None]:
def _write_de_sheet(writer, sheet_name, first_group_cells, second_group_cells, sort_key=None):
    """Score differential expression between two groups of cells and write one sheet.

    Parameters:
        writer: open pandas.ExcelWriter that receives the sheet.
        sheet_name: name of the worksheet to create.
        first_group_cells: boolean mask over all cells selecting the first group.
        second_group_cells: boolean mask over all cells selecting the second group.
        sort_key: optional function applied to the "Bayes Factor" column before
            sorting (descending). None sorts by the value itself.

    Reads the notebook-level `full` posterior and `ensemble_gene_name_lookup`.
    """
    de_df = full.differential_expression_score(
        first_group_cells,
        second_group_cells,
        mode="change"
    )

    output_df = pandas.DataFrame(index=de_df.index, columns=["Gene", "Bayes Factor", "Probability of DE", "Cluster Mean", "Non Cluster Mean"])

    # Aligned on the index of de_df, which is looked up as a gene identifier.
    output_df["Gene"] = pandas.Series(ensemble_gene_name_lookup)
    output_df["Bayes Factor"] = de_df["bayes_factor"]
    output_df["Probability of DE"] = de_df["proba_de"]
    # NOTE(review): despite the column titles, these hold raw_mean1 / raw_mean2,
    # presumably the means of the first and second group passed in (capillary
    # and venous for every call below) - confirm and consider renaming.
    output_df["Cluster Mean"] = de_df["raw_mean1"]
    output_df["Non Cluster Mean"] = de_df["raw_mean2"]

    output_df = output_df.sort_values(by="Bayes Factor", key=sort_key, ascending=False)

    output_df.to_excel(writer, sheet_name=sheet_name)


# The context manager writes and closes the workbook on exit. It replaces the
# explicit ExcelWriter.save() call, which newer pandas versions no longer have.
with pandas.ExcelWriter(os.path.join("data", "VenousVsCapillary", "differential_expression.xlsx"), engine="xlsxwriter") as excel_file:

    # Capillary vs venous within each labelled cluster.
    for cluster, cell_type in sorted(cluster_cell_marker_map.items(), key=lambda x: x[0]):

        print(cluster)

        cluster_capillary_cells = (clusters == cluster) & (capillary_blood_cells)
        cluster_venous_cells = (clusters == cluster) & (venous_blood_cells)

        # Nothing to compare when one side has no cells.
        if cluster_capillary_cells.sum() == 0 or cluster_venous_cells.sum() == 0:
            continue

        # NOTE(review): these sheets are ranked by |Bayes factor| while the
        # sheets below are ranked by the signed value - confirm the difference
        # is intended.
        _write_de_sheet(
            excel_file,
            "Cluster %i - %s" % (cluster, cell_type),
            cluster_capillary_cells,
            cluster_venous_cells,
            sort_key=numpy.abs
        )

    # Capillary vs venous within each cell type (all clusters sharing a label
    # are pooled). Sorted so the sheet order is the same on every run.
    for cell_type in sorted(set(cluster_cell_marker_map.values())):

        print(cell_type)

        cell_type_clusters = [
            cluster for cluster, label in cluster_cell_marker_map.items()
            if label == cell_type
        ]
        cell_type_cells = numpy.isin(clusters, cell_type_clusters)

        cell_type_capillary_cells = (cell_type_cells) & (capillary_blood_cells)
        cell_type_venous_cells = (cell_type_cells) & (venous_blood_cells)

        if cell_type_capillary_cells.sum() == 0 or cell_type_venous_cells.sum() == 0:
            continue

        _write_de_sheet(
            excel_file,
            "%s" % (cell_type),
            cell_type_capillary_cells,
            cell_type_venous_cells
        )

    # Capillary vs venous over all cells.
    _write_de_sheet(
        excel_file,
        "Capillary vs Venous",
        capillary_blood_cells,
        venous_blood_cells
    )

In [None]:
# Explore how prominent a marker gene is across the embedding: every cell on
# the t-SNE plot is coloured by its transcript count for GENE. Change GENE to
# inspect a different marker.

GENE = "HBB"
gene_index = ensembl_id_intersection.index(gene_name_ensembl_lookup[GENE])

x = tsne[:, 0]
y = tsne[:, 1]

trace = graph_objects.Scattergl(
    x=x,
    y=y,
    # Label the trace with the gene being shown. The previous label reused
    # `cluster_index`, a loop variable left over from an earlier cell that has
    # nothing to do with this plot.
    name=GENE,
    mode="markers",
    marker={
        "color": aligned_sample_transcript_counts[:, gene_index].toarray().flatten(),
        "size": 5

    })

traces = [trace]

figure = graph_objects.Figure(traces)

plotly.iplot(figure)