In [None]:
import os

import numpy
import pandas
from plotly import offline as plotly
from plotly import graph_objects
from scipy import stats
from statsmodels.stats import multitest

from pepars.plotting import plotting
plotting.init_notebook_mode()

from capblood_seq.dataset import Capblood_Seq_Dataset
from capblood_seq import common as cbs
from capblood_seq import viz as cbs_viz

In [None]:
# False discovery rates to evaluate. Each value is passed as `alpha` to the
# Benjamini-Hochberg correction and produces one pair of threshold lines in the figure.
FDR_THRESHOLDS = [0.1, 0.25]
# Passed to dataset.filter_genes_by_percent_abundance below
# (presumably a minimum fraction of cells expressing the gene - confirm against the dataset API)
GENE_ABUNDANCE_FILTER = 0.1

# Whether to pool subjects into one t-test (True) or perform a test on each subject
# separately and then combine via Stouffer's method (False)
POOL_SUBJECTS = False

# Whether to normalize means within each subject - recommend doing this
# if POOL_SUBJECTS is True
# NOTE: when POOL_SUBJECTS is False this has no effect on the per-subject t-tests,
# since the same constant is subtracted from both the AM and the PM group.
NORMALIZE_WITHIN_SUBJECT = False

In [None]:
# Load the dataset. This downloads it if it doesn't exist already, and loads it into memory
# The dataset is pointed at the local "data" directory and the "normalized" pipeline.
dataset = Capblood_Seq_Dataset(data_directory="data", pipeline_name="normalized")
dataset.load()

In [None]:
# Restrict the analysis to the cells we were able to identify
# (presumably this drops cells carrying more than one of the cbs.CELL_TYPES labels -
# confirm against filter_multi_labeled_cells)
# NOTE(review): neither return value is used, so both calls presumably filter `dataset`
# in place; re-run the load cell first if this cell needs to be re-run.
dataset.filter_multi_labeled_cells(cbs.CELL_TYPES)
# Drop low-abundance genes using the GENE_ABUNDANCE_FILTER cutoff
# (any_sample=True presumably keeps a gene that passes in at least one sample - TODO confirm)
dataset.filter_genes_by_percent_abundance(GENE_ABUNDANCE_FILTER, any_sample=True)

In [None]:
# Dimensions of the result arrays allocated further down:
# one row per remaining gene, one column per cell type (plus one for all cells).
num_cell_types = len(cbs.CELL_TYPES)
num_genes = dataset.get_num_genes()

print("%s genes after filtering" % num_genes)

In [None]:
# Cache every (cell type, subject, sample) count matrix as an array up front, so the
# per-gene loop only has to slice columns instead of going back to the dataset.
# The trailing None entry is the one later cells treat as "all cells".

cell_type_subject_sample_transcript_counts = {}

for cell_type_index, cell_type in enumerate(cbs.CELL_TYPES + [None]):
    for subject_index, subject_id in enumerate(cbs.SUBJECT_IDS):
        for sample in cbs.SAMPLE_NAMES:

            sample_key = (cell_type, subject_id, sample)

            transcript_counts = dataset.get_transcript_counts(
                sample, cell_type, subject_id, normalized=True
            )

            # Combinations the dataset has nothing for come back as None and are
            # simply left out of the cache; the test loop checks for membership.
            if transcript_counts is not None:
                cell_type_subject_sample_transcript_counts[sample_key] = \
                    transcript_counts.to_array()

In [None]:
# AM-vs-PM test for every gene in every cell type.
#
# All four arrays are (gene, cell type), with one extra trailing column for the
# None entry that stands for "all cells":
#   hypothesis_p_values / hypothesis_z_scores - test result per gene and cell type
#   num_cells_expressing_gene / num_cells     - cells with a non-zero count / all cells,
#                                               summed over every subject and sample
#
# The observations fed to the t-test are per-sample mean expression values, split
# into AM and PM by whether "AM" appears in the sample name.
hypothesis_p_values = numpy.zeros((num_genes, num_cell_types + 1))
hypothesis_z_scores = numpy.zeros((num_genes, num_cell_types + 1))
num_cells_expressing_gene = numpy.zeros((num_genes, num_cell_types + 1))
num_cells = numpy.zeros((num_genes, num_cell_types + 1))

for cell_type_index, cell_type in enumerate(cbs.CELL_TYPES + [None]):

    for gene_index, gene in enumerate(dataset._gene_list):

        # Coarse progress indicator
        if gene_index % 1000 == 0:
            print(gene_index)

        # Only used when testing each subject separately
        subject_p_values = []
        num_samples_per_subject = []

        # Only used when pooling all subjects into a single test
        AM_means = []
        PM_means = []

        for subject_index, subject_id in enumerate(cbs.SUBJECT_IDS):

            subject_AM_means = []
            subject_PM_means = []

            for sample in cbs.SAMPLE_NAMES:

                # Samples without data for this cell type/subject were never cached
                if (cell_type, subject_id, sample) not in cell_type_subject_sample_transcript_counts:
                    continue

                transcript_counts = \
                    cell_type_subject_sample_transcript_counts[(cell_type, subject_id, sample)][:, gene_index]

                gene_mean = transcript_counts.mean()
                num_cells_expressing_gene[gene_index, cell_type_index] += \
                    transcript_counts[transcript_counts > 0].shape[0]
                num_cells[gene_index, cell_type_index] += transcript_counts.shape[0]

                if "AM" in sample:
                    subject_AM_means.append(gene_mean)
                else:
                    subject_PM_means.append(gene_mean)

            subject_AM_means = numpy.array(subject_AM_means)
            subject_PM_means = numpy.array(subject_PM_means)

            num_samples = len(subject_AM_means) + len(subject_PM_means)

            # A subject with no samples for this cell type contributes nothing to either
            # mode; skipping here avoids mean-of-empty and empty t-test warnings.
            if num_samples == 0:
                continue

            if NORMALIZE_WITHIN_SUBJECT:
                # Centre this subject's sample means on their overall mean. Summing both
                # groups weights each group by its own sample count (the previous
                # expression weighted each group mean by the *other* group's count,
                # which is not the subject mean when the counts differ) and stays
                # finite when one group is empty.
                mean_of_means = \
                    (subject_AM_means.sum() + subject_PM_means.sum())/num_samples
                subject_AM_means -= mean_of_means
                subject_PM_means -= mean_of_means

            if not POOL_SUBJECTS:
                # ttest_ind is called with its default (two-sided) alternative, so the
                # p-values combined below do not encode whether AM or PM was higher.
                z, p = stats.ttest_ind(subject_AM_means, subject_PM_means)
                # NaN means the test could not be computed for this subject; leave it out
                if numpy.isnan(z):
                    continue
                num_samples_per_subject.append(num_samples)
                subject_p_values.append(p)
            else:
                AM_means.extend(subject_AM_means)
                PM_means.extend(subject_PM_means)

        if not POOL_SUBJECTS:

            if len(subject_p_values) != 0:
                # Combine the per-subject results, weighting each subject by its sample count
                z, p = stats.combine_pvalues(subject_p_values, method="stouffer", weights=num_samples_per_subject)
            else:
                # No subject produced a usable test for this gene/cell type
                z = numpy.nan
                p = numpy.nan
        else:
            z, p = stats.ttest_ind(AM_means, PM_means)

        hypothesis_p_values[gene_index, cell_type_index] = p
        hypothesis_z_scores[gene_index, cell_type_index] = z

In [None]:
# Cell-type columns only; the last column of hypothesis_p_values holds the all-cells test.
# This is a copy rather than a slice view: the NaN fill further down would otherwise
# write through to hypothesis_p_values, and re-running this cell would then feed the
# filled-in 1s into the FDR correction as if they were real tests.
hypothesis_cell_type_p_values = hypothesis_p_values[:, 0:num_cell_types].copy()
hypothesis_cell_type_z_scores = hypothesis_z_scores[:, 0:num_cell_types]
# flatten() always copies, so this keeps its NaNs after the fill below
hypothesis_cell_type_p_values_unwrapped = hypothesis_cell_type_p_values.flatten()
hypothesis_cell_type_p_values_nan_mask = numpy.isnan(hypothesis_cell_type_p_values_unwrapped)

# Only tests that produced a p-value take part in the multiple comparison correction
hypothesis_cell_type_p_values_non_nan = hypothesis_cell_type_p_values_unwrapped[~hypothesis_cell_type_p_values_nan_mask]

# One z score per entry of FDR_THRESHOLDS: the z score of the least significant test
# that is still rejected at that FDR (NaN when nothing is rejected).
cell_type_thresholds = []

for threshold in FDR_THRESHOLDS:
    # Benjamini-Hochberg correction at FDR = threshold
    rejected, _, _, _ = multitest.multipletests(hypothesis_cell_type_p_values_non_nan, method="fdr_bh", alpha=threshold)

    if not rejected.any():
        # Nothing is significant at this FDR, so there is no boundary to report;
        # taking .max() of the empty selection would raise.
        cell_type_thresholds.append(numpy.nan)
        continue

    p_value_threshold = hypothesis_cell_type_p_values_non_nan[rejected].max()
    cell_type_thresholds.append(
        hypothesis_cell_type_z_scores[hypothesis_cell_type_p_values == p_value_threshold][0]
    )

# Fill in any nans with 1 for minimization
hypothesis_cell_type_p_values[numpy.isnan(hypothesis_cell_type_p_values)] = 1

# Get the index of the lowest p-value cell type for each gene
lowest_cell_types = hypothesis_cell_type_p_values.argmin(axis=1)

# Get the associated z score for the lowest p-value
cell_type_z_scores = hypothesis_z_scores[numpy.arange(hypothesis_z_scores.shape[0]), lowest_cell_types]

In [None]:
# Sanity check: expected to display (num_genes, num_cell_types)
hypothesis_cell_type_p_values.shape

In [None]:
# Sanity check: the per-gene minimum over cell types, expected to display (num_genes,)
hypothesis_cell_type_p_values.min(axis=1).shape

In [None]:
# Table of p-values: one row per gene, one column per cell type, plus a "min" column
# holding the smallest p-value across cell types. Rows are ordered most significant first.
lowest_p_value_per_gene = hypothesis_cell_type_p_values.min(axis=1)

diurnal_genes_df = (
    pandas.DataFrame(
        hypothesis_cell_type_p_values,
        index=dataset._gene_list,
        columns=cbs.CELL_TYPES
    )
    .assign(**{"min": lowest_p_value_per_gene})
    .sort_values(by="min", ascending=True)
)

diurnal_genes_csv_path = os.path.join("data", "gene_diurnality.csv")
diurnal_genes_df.to_csv(diurnal_genes_csv_path)

In [None]:
# Get just the p values (and matching z scores) associated with all cells,
# i.e. the last column of the result arrays
hypothesis_all_cells_p_values = hypothesis_p_values[:, -1]
all_cells_z_scores = hypothesis_z_scores[:, -1]

# Tests that could not be computed are NaN. Exclude them from the correction so they do
# not count towards the number of hypotheses, the same way the per-cell-type
# correction excludes them.
hypothesis_all_cells_p_values_non_nan = \
    hypothesis_all_cells_p_values[~numpy.isnan(hypothesis_all_cells_p_values)]

# One z score per entry of FDR_THRESHOLDS: the z score of the least significant gene
# that is still rejected at that FDR (NaN when nothing is rejected).
all_cells_thresholds = []

for threshold in FDR_THRESHOLDS:
    # Benjamini-Hochberg correction at FDR = threshold
    rejected, _, _, _ = multitest.multipletests(hypothesis_all_cells_p_values_non_nan, method="fdr_bh", alpha=threshold)

    if not rejected.any():
        # Nothing is significant at this FDR; .max() of the empty selection would raise
        all_cells_thresholds.append(numpy.nan)
        continue

    p_value_threshold = hypothesis_all_cells_p_values_non_nan[rejected].max()
    all_cells_thresholds.append(all_cells_z_scores[hypothesis_all_cells_p_values == p_value_threshold][0])

# The z score threshold for the last FDR level. Kept for any code that still reads the
# singular name; taken from the list instead of a variable leaked out of the loop.
all_cells_threshold = all_cells_thresholds[-1]

In [None]:
# Largest fraction (0-1, despite the name) of cells expressing each gene across the
# cell type columns, including the all-cells column. Columns with no cells at all are
# treated as 0 instead of 0/0 = NaN, which would otherwise turn every gene's
# maximum, and therefore every marker size in the figure, into NaN.
percent_cells_expressing_gene = numpy.divide(
    num_cells_expressing_gene,
    num_cells,
    out=numpy.zeros_like(num_cells_expressing_gene, dtype=float),
    where=num_cells > 0
).max(axis=1)

In [None]:
# One marker color per gene: the color assigned to the cell type in which
# that gene had its lowest p-value.
lowest_cell_type_names = [cbs.CELL_TYPES[lowest_index] for lowest_index in lowest_cell_types]
colors = [cbs_viz.CELL_TYPE_COLORS[cell_type_name] for cell_type_name in lowest_cell_type_names]

In [None]:
# NOTE(review): N is not used anywhere in the visible notebook; kept in case other code reads it
N=150

# Both axes share one square range: the smallest and largest z score over both
# series, rounded outwards to the nearest 0.25. NaN (test not computable) and
# infinite (p-value of exactly 0 or 1) entries are ignored, because a single one of
# them would otherwise make the whole axis range NaN or infinite.
plotted_z_scores = numpy.concatenate((all_cells_z_scores, cell_type_z_scores))
finite_z_scores = plotted_z_scores[numpy.isfinite(plotted_z_scores)]

if finite_z_scores.size == 0:
    raise ValueError("No finite z scores available to plot")

min_x = numpy.floor(finite_z_scores.min()*4)/4
max_x = numpy.ceil(finite_z_scores.max()*4)/4
min_y = min_x
max_y = max_x

# Main panel: one point per gene, x = all-cells z score, y = z score of the gene's most
# significant cell type, sized by how widely the gene is expressed and colored by
# that cell type.
gene_scatter = graph_objects.Scatter(
    x=all_cells_z_scores,
    y=cell_type_z_scores,
    mode="markers",
    text=dataset._gene_list,
    marker=dict(
        size=2+percent_cells_expressing_gene*7,
        color=colors,
        line=dict(
            width=1,
            color="rgba(0, 0, 0, 0.5)"
        )
    ),
    opacity=0.5
)

# Marginal distributions, drawn on the secondary axes defined in the layout
marginal_marker = {"color": "red", "opacity": 0.5}

all_cells_histogram = graph_objects.Histogram(
    x=all_cells_z_scores,
    name="x density",
    marker=dict(marginal_marker),
    yaxis="y2"
)

cell_type_histogram = graph_objects.Histogram(
    y=cell_type_z_scores,
    name="y density",
    marker=dict(marginal_marker),
    xaxis="x2"
)

data = [gene_scatter, all_cells_histogram, cell_type_histogram]

# Draw one vertical (all cells) and one horizontal (by cell type) line per FDR threshold.
#
# The thresholds are copied into new arrays instead of being rebound in place, so that
# re-running this cell does not keep transforming the lists computed earlier.
plotted_all_cells_thresholds = numpy.asarray(all_cells_thresholds, dtype=float)
plotted_cell_type_thresholds = numpy.asarray(cell_type_thresholds, dtype=float)

# Whether the statistic is signed depends on how it was computed, not on the
# normalization. The pooled t-test gives a signed statistic (significant genes lie on
# either side of zero), so each threshold is mirrored. The Stouffer combination of
# per-subject p-values puts significance in the positive tail only.
if POOL_SUBJECTS:
    plotted_all_cells_thresholds = numpy.abs(plotted_all_cells_thresholds)
    plotted_all_cells_thresholds = \
        numpy.concatenate((plotted_all_cells_thresholds, -plotted_all_cells_thresholds))
    plotted_cell_type_thresholds = numpy.abs(plotted_cell_type_thresholds)
    plotted_cell_type_thresholds = \
        numpy.concatenate((plotted_cell_type_thresholds, -plotted_cell_type_thresholds))

threshold_line_style = dict(color="black", width=2)

for all_cells_threshold_z, cell_type_threshold_z in \
        zip(plotted_all_cells_thresholds, plotted_cell_type_thresholds):

    # graph_objects.Line is deprecated; a lines-only Scatter trace draws the same segment
    # without needing invisible markers.
    data.append(
        graph_objects.Scatter(
            x=[all_cells_threshold_z, all_cells_threshold_z],
            y=[min_y, max_y],
            mode="lines",
            line=dict(threshold_line_style)
        )
    )

    data.append(
        graph_objects.Scatter(
            x=[min_x, max_x],
            y=[cell_type_threshold_z, cell_type_threshold_z],
            mode="lines",
            line=dict(threshold_line_style)
        )
    )

# Figure layout: the scatter occupies the lower-left 83% of the canvas and the two
# marginal histograms sit in the remaining strips along the top and the right.
main_panel_domain = [0, 0.83]
marginal_panel_domain = [0.85, 1]
plain_axis = dict(showgrid=False, zeroline=False)
transparent = "rgba(255, 255, 255, 0)"

layout = graph_objects.Layout(
    title="Distribution Significances AM vs PM",
    showlegend=False,
    autosize=False,
    width=850,
    height=850,
    margin=dict(l=20, t=100),
    hovermode="closest",
    bargap=0.1,
    # Main scatter axes
    xaxis=dict(range=[min_x, max_x], domain=main_panel_domain, title="All Cells", **plain_axis),
    yaxis=dict(range=[min_y, max_y], domain=main_panel_domain, title="By Cell Type", **plain_axis),
    # Count axes of the marginal histograms
    xaxis2=dict(domain=marginal_panel_domain, title="# Genes", **plain_axis),
    yaxis2=dict(domain=marginal_panel_domain, title="# Genes", **plain_axis),
    plot_bgcolor=transparent,
    paper_bgcolor=transparent
)

figure = graph_objects.Figure(data=data, layout=layout)

# Render inline in the notebook
plotly.iplot(figure)

In [None]:
# Save the figure as a static SVG and as a standalone interactive HTML page.
# The output directory is created first so that a fresh checkout does not fail here.
figures_directory = "figures"
os.makedirs(figures_directory, exist_ok=True)

figure.write_image(os.path.join(figures_directory, "diurnal_gene_detection.svg"))
figure.write_html(os.path.join(figures_directory, "diurnal_gene_detection.html"))