In [None]:
import os

import numpy
import pandas
from plotly import offline as plotly
from plotly import graph_objects
from scipy import stats
from statsmodels.stats import multitest

from pepars.plotting import plotting
plotting.init_notebook_mode()

import capblood_seq
from capblood_seq import config

In [None]:
# Load the "normalized" pipeline output from the local "data" directory into memory.
# (Per the original note, the dataset is downloaded first if it is not already present.)
dataset = capblood_seq.load_dataset(
    data_directory="data",
    pipeline_name="normalized"
)

In [None]:
# Restrict the dataset to cells that can be used unambiguously: drop cells carrying
# more than one label, first at the cell type level and then at the subtype level.
dataset.filter_multi_labeled_cells(config.CELL_TYPES)

# Subtype labels are written "<subtype> <parent type>"
cell_subtype_labels = []
for parent_type in config.CELL_SUBTYPES:
    for subtype in config.CELL_SUBTYPES[parent_type]:
        cell_subtype_labels.append("%s %s" % (subtype, parent_type))

# NOTE(review): this repeats the cell type filter applied just above - presumably
# redundant, but kept because the filter's implementation is not visible here.
dataset.filter_multi_labeled_cells(config.CELL_TYPES)
dataset.filter_multi_labeled_cells(cell_subtype_labels)

# Drop low-abundance genes (threshold 0.1 - TODO confirm whether this is a
# percentage or a fraction)
dataset.filter_genes_by_percent_abundance(0.1)

In [None]:
# Flat list of cell type labels: a cell type that has subtypes is replaced by one
# "<subtype> <cell type>" entry per subtype; any other cell type is listed as-is.
cell_types = []

for parent_cell_type in config.CELL_TYPES:

    if parent_cell_type not in config.CELL_SUBTYPES:
        cell_types.append(parent_cell_type)
        continue

    cell_types.extend(
        child_cell_type + " " + parent_cell_type
        for child_cell_type in config.CELL_SUBTYPES[parent_cell_type]
    )

In [None]:
# Dimensions of the (gene x cell type) grid of hypotheses tested below
num_cell_types = len(cell_types)
num_genes = dataset.get_num_genes()

In [None]:
# Mean normalized transcript count of every gene, keyed by
# (cell type, subject, sample). The extra cell type of None is what the later
# "all cells" analysis reads back. Combinations for which the dataset returns
# no counts are simply left out of the dictionary.
cell_type_subject_sample_transcript_means = {}

for cell_type in cell_types + [None]:
    for subject_id in config.SUBJECT_IDS:
        for sample in config.SAMPLE_NAMES:

            counts = dataset.get_transcript_counts(sample, cell_type, subject_id, normalized=True)

            if counts is not None:
                # Average over axis 0 so that one value per gene remains
                means_key = (cell_type, subject_id, sample)
                cell_type_subject_sample_transcript_means[means_key] = counts.to_array().mean(axis=0)

In [None]:
# Test every (gene, cell type) pair for a difference between AM and PM samples.
#
# Each subject is tested on its own with a two-sample t-test on the per-sample mean
# expression of the gene. The per-subject p-values are then combined with Stouffer's
# method, weighted by the number of samples that subject contributed. The extra
# trailing column corresponds to a cell type of None.
hypothesis_p_values = numpy.zeros((num_genes, num_cell_types + 1))
hypothesis_z_scores = numpy.zeros((num_genes, num_cell_types + 1))

for cell_type_index, cell_type in enumerate(cell_types + [None]):

    # One (t statistics, p-values, number of samples) entry per testable subject;
    # the first two are arrays with one value per gene
    subject_tests = []

    for subject_id in config.SUBJECT_IDS:

        # Which samples are AM and which are PM does not depend on the gene, so the
        # per-sample mean vectors are grouped once here rather than once per gene
        AM_means = []
        PM_means = []

        for sample in config.SAMPLE_NAMES:

            sample_key = (cell_type, subject_id, sample)

            if sample_key not in cell_type_subject_sample_transcript_means:
                continue

            sample_means = cell_type_subject_sample_transcript_means[sample_key]

            if "AM" in sample:
                AM_means.append(sample_means)
            else:
                PM_means.append(sample_means)

        # With no AM or no PM sample there is nothing to compare (the test statistic
        # would be NaN for every gene), so the subject contributes nothing
        if len(AM_means) == 0 or len(PM_means) == 0:
            continue

        # A single vectorized test covers all genes: rows are samples, columns are genes.
        # Genes without any variance produce 0/0 here; the resulting NaN is handled below.
        with numpy.errstate(divide="ignore", invalid="ignore"):
            t_statistics, p_values = stats.ttest_ind(
                numpy.vstack(AM_means),
                numpy.vstack(PM_means),
                axis=0
            )

        subject_tests.append((t_statistics, p_values, len(AM_means) + len(PM_means)))

    for gene_index, gene in enumerate(dataset._gene_list):

        subject_p_values = []
        num_samples = []

        for t_statistics, p_values, subject_num_samples in subject_tests:

            # A NaN statistic means this gene could not be tested for this subject
            if numpy.isnan(t_statistics[gene_index]):
                continue

            num_samples.append(subject_num_samples)
            subject_p_values.append(p_values[gene_index])

        if len(subject_p_values) != 0:
            z, p_value_combined = stats.combine_pvalues(subject_p_values, method="stouffer", weights=num_samples)
        else:
            z = numpy.nan
            p_value_combined = numpy.nan

        hypothesis_p_values[gene_index, cell_type_index] = p_value_combined
        hypothesis_z_scores[gene_index, cell_type_index] = z

        # Report the combinations that could not be tested at all
        if numpy.isnan(p_value_combined):
            print(gene, cell_type)

In [None]:
# Keep only the per-cell-type columns (the final column of hypothesis_p_values is left out)
hypothesis_cell_type_p_values = hypothesis_p_values[:, :num_cell_types]

# Flatten to one long vector of p-values and discard the entries that are NaN
hypothesis_cell_type_p_values_unwrapped = hypothesis_cell_type_p_values.reshape(-1)
hypothesis_cell_type_p_values_nan_mask = numpy.isnan(hypothesis_cell_type_p_values_unwrapped)
hypothesis_cell_type_p_values_non_nan = hypothesis_cell_type_p_values_unwrapped[
    numpy.logical_not(hypothesis_cell_type_p_values_nan_mask)
]

In [None]:
# Histogram of the uncorrected per-cell-type p-values, NaN entries excluded
plotting.plot_histogram(
    hypothesis_cell_type_p_values_non_nan.reshape(-1),
    interactive=True
)

In [None]:
# Benjamini-Hochberg FDR correction across every testable (gene, cell type) hypothesis
rejected, corrected_cell_type_p_values, _, _ = multitest.multipletests(
    hypothesis_cell_type_p_values_non_nan,
    method="fdr_bh",
    alpha=0.10
)

# The largest uncorrected p-value that was still rejected acts as the cutoff on raw
# p-values below. If nothing was rejected there is no such value (and .max() of an
# empty selection would raise), so use a cutoff that no p-value can satisfy.
if rejected.any():
    p_value_threshold = hypothesis_cell_type_p_values_non_nan[rejected].max()
else:
    p_value_threshold = -numpy.inf

p_values_by_cell_type = []
cell_type_significant_genes = []

for gene_index, gene in enumerate(dataset._gene_list):

    # Untestable (NaN) entries are treated as p = 1. Build a new array for this rather
    # than assigning into the row: hypothesis_cell_type_p_values is a view of
    # hypothesis_p_values, so an in-place write would silently alter the raw results.
    gene_p_values = hypothesis_cell_type_p_values[gene_index]
    gene_p_values = numpy.where(numpy.isnan(gene_p_values), 1, gene_p_values)
    gene_p_value_min = gene_p_values.min()

    # Keep a gene only if it is significant in at least one cell type
    if gene_p_value_min > p_value_threshold:
        continue

    p_values_by_cell_type.append(list(gene_p_values) + [gene_p_value_min])

    cell_type_significant_genes.append(gene)

# One row per significant gene, one column per cell type plus the row minimum,
# ordered from most to least significant
cell_type_significant_genes_df = pandas.DataFrame(
    p_values_by_cell_type,
    index=cell_type_significant_genes,
    columns=cell_types + ["Min"]
).sort_values(by="Min")

cell_type_significant_genes_df

In [None]:
# (number of genes significant in at least one cell type, number of cell types + 1 for "Min")
cell_type_significant_genes_df.shape

In [None]:
# Benjamini-Hochberg FDR correction for the final column of hypothesis_p_values
# (the "all cells" results).
all_cells_p_values = hypothesis_p_values[:, -1]

# A NaN p-value marks a gene that could not be tested. Such genes are left out of the
# correction, as is done for the by-cell-type p-values, so they are never counted
# as hypotheses. They stay "not rejected" with a NaN corrected p-value.
all_cells_testable_mask = ~numpy.isnan(all_cells_p_values)

rejected = numpy.zeros(all_cells_p_values.shape, dtype=bool)
corrected_all_cells_p_values = numpy.full(all_cells_p_values.shape, numpy.nan)

if all_cells_testable_mask.any():
    testable_rejected, testable_corrected_p_values, _, _ = multitest.multipletests(
        all_cells_p_values[all_cells_testable_mask],
        method="fdr_bh",
        alpha=0.1
    )
    rejected[all_cells_testable_mask] = testable_rejected
    corrected_all_cells_p_values[all_cells_testable_mask] = testable_corrected_p_values

all_cells_significant_genes = numpy.array(dataset._gene_list)[rejected]

# Uncorrected p-values of the significant genes, most significant first
all_cells_significant_genes_df = pandas.DataFrame(
    all_cells_p_values[rejected],
    index=all_cells_significant_genes,
    columns=["p"]
).sort_values(by="p")

In [None]:
# Genes rejected in the all-cells test, with their uncorrected p-values in ascending order
all_cells_significant_genes_df

In [None]:
# (number of genes rejected in the all-cells test, 1)
all_cells_significant_genes_df.shape

In [None]:



# Scatter of per-gene significance ("All Cells" on x, "By Cell Type" on y) with marginal
# histograms of both axes and a black line marking the significance threshold on each axis.
#
# NOTE(review): all_cells_z_scores, cell_type_z_scores, gene_list,
# percent_cells_expressing_gene, colors, all_cells_significance_threshold and
# by_cell_type_significance_threshold are not assigned in any cell visible above -
# confirm they are defined before this cell is run.


def _threshold_line(x, y):
    """Return a plain black line trace through the given points (a threshold marker)."""
    # graph_objects.Line is deprecated; a Scatter trace in "lines" mode draws the same
    # line without needing invisible markers
    return graph_objects.Scatter(
        x=x,
        y=y,
        mode="lines",
        line=dict(
            color="black",
            width=2
        )
    )


# NOTE(review): N is not referenced anywhere in this cell; kept in case it is used elsewhere
N=150

# Axis limits, rounded outwards to the nearest 0.25
min_x = numpy.floor(all_cells_z_scores.min()*4)/4
max_x = numpy.ceil(all_cells_z_scores.max()*4)/4
min_y = numpy.floor(cell_type_z_scores.min()*4)/4
max_y = numpy.ceil(cell_type_z_scores.max()*4)/4

# Use the same range on both axes so the plot is square
min_x = min(min_x, min_y)
max_x = max(max_x, max_y)
min_y = min_x
max_y = max_x

data = [
    # One marker per gene; marker size grows with the row-wise maximum of
    # percent_cells_expressing_gene
    graph_objects.Scatter(
        x=all_cells_z_scores,
        y=cell_type_z_scores,
        mode="markers",
        text=gene_list,
        marker={
            "size": 2+percent_cells_expressing_gene.max(axis=1)*7,
            "color": colors,
            "line": {
                "width": 1,
                "color": "rgba(0, 0, 0, 0.5)"
            }
        },
        opacity=0.5
    ),
    # Vertical line at the all-cells threshold
    _threshold_line(
        x=[all_cells_significance_threshold, all_cells_significance_threshold],
        y=[min_y, max_y]
    ),
    # Horizontal line at the by-cell-type threshold
    _threshold_line(
        x=[min_x, max_x],
        y=[by_cell_type_significance_threshold, by_cell_type_significance_threshold]
    ),
    # Marginal histograms, drawn on the secondary axes along the top and right edges
    graph_objects.Histogram(
        x=all_cells_z_scores,
        name="x density",
        marker=dict(color="red", opacity=0.5),
        yaxis="y2"
    ),
    graph_objects.Histogram(
        y=cell_type_z_scores,
        name="y density",
        marker=dict(color="red", opacity=0.5),
        xaxis="x2"
    )
]

layout = graph_objects.Layout(
    title= "Distribution Significances AM vs PM",
    showlegend=False,
    autosize=False,
    width=850,
    height=850,
    # The main scatter occupies the lower-left 83% of the figure in each direction
    xaxis=dict(
        range=[min_x, max_x],
        domain=[0, 0.83],
        showgrid=False,
        zeroline=False,
        title="All Cells"),
    yaxis=dict(
        range=[min_y, max_y],
        domain=[0, 0.83],
        showgrid=False,
        zeroline=False,
        title="By Cell Type"
    ),
    margin=dict(
        l=20,
        t=100),
    hovermode="closest",
    bargap=0.1,
    # The remaining strip along each edge holds the marginal histograms
    xaxis2=dict(
        domain=[0.85, 1],
        showgrid=False,
        zeroline=False,
        title="# Genes"
    ),
    yaxis2=dict(
        domain=[0.85, 1],
        showgrid=False,
        zeroline=False,
        title="# Genes"
    ),
    plot_bgcolor="rgba(255, 255, 255, 0)",
    paper_bgcolor="rgba(255, 255, 255, 0)"
)

figure = graph_objects.Figure( data=data, layout=layout )

plotly.iplot(figure)