In [None]:
import os
import numpy
from plotly import offline as plotly
from plotly import graph_objects
from scipy import stats
from statsmodels.stats import multitest

from capblood_seq_poc.dataset import Capblood_Seq_Dataset
from capblood_seq_poc import common as cbs
from capblood_seq_poc import viz as cbs_viz

In [None]:
# Cell type label whose per-sample ratio is compared between AM and PM samples
CELL_TYPE = "CD8 T Cells"

# We hypothesize that the AM distribution of this cell type's ratio is greater than the PM one.
# Options are: "greater", "less", or "2-sided". Only "greater" and "less" are
# special-cased by the analysis; any other value leaves the t-test two-sided.
HYPOTHESIS = "greater"

In [None]:
# Load the dataset. This downloads it if it doesn't exist already, and loads it into memory
# NOTE(review): data_directory is a relative path, so it presumably resolves against the
# kernel's working directory — confirm before running from another location.
dataset = Capblood_Seq_Dataset(data_directory="data", pipeline_name="debris_filtered")
dataset.load()

In [None]:
# Filter on subject labels first: unlabeled cells, then multi-labeled cells
dataset.filter_unlabeled_cells(cbs.SUBJECT_IDS)
dataset.filter_multi_labeled_cells(cbs.SUBJECT_IDS)

# We want to compare percentages only relative to the cells we were able to identify.
# NOTE(review): the two branches are mutually exclusive, so when CELL_TYPE is a
# subtype the top-level cbs.CELL_TYPES filter is NOT applied — confirm that is intended.
if CELL_TYPE not in cbs.CELL_TYPES:
    # CELL_TYPE is a subtype: build every "<cell type> <subtype>" label and filter on those
    cell_subtype_labels = [
        "%s %s" % (parent_type, subtype)
        for parent_type in cbs.CELL_SUBTYPES
        for subtype in cbs.CELL_SUBTYPES[parent_type]
    ]
    dataset.filter_multi_labeled_cells(cell_subtype_labels)
else:
    # CELL_TYPE is a top-level cell type: filter on the top-level labels
    dataset.filter_multi_labeled_cells(cbs.CELL_TYPES)

In [None]:
# Per-sample ratios (cells of CELL_TYPE / all cells in the sample), pooled over all
# subjects and split by time of day, with a matching marker color and hover label
# per point.
AM_cell_ratios = []
num_AM_samples = 0
PM_cell_ratios = []
num_PM_samples = 0
AM_marker_colors = []
PM_marker_colors = []
AM_text_labels = []
PM_text_labels = []

# One p value per tested subject, and the number of samples behind it (used as its weight)
p_values = []
num_samples = []

for subject_id in cbs.SUBJECT_IDS:

    subject_AM_cell_ratios = []
    subject_PM_cell_ratios = []

    for sample in cbs.SAMPLE_NAMES:

        # Get the subject's number of cells of this cell type and all cells
        num_cell_type_cells = dataset.get_num_cells(sample, subject_id=subject_id, cell_type=CELL_TYPE)
        num_cells = dataset.get_num_cells(sample, subject_id=subject_id)

        # Skip subjects that don't have cells in this sample
        if not num_cells:
            continue

        ratio = num_cell_type_cells / num_cells

        # Color subjects by the common color map
        marker_color = cbs_viz.SUBJECT_ID_COLORS[subject_id]
        text_label = "%s<br>%s" % (subject_id, sample)

        # Save it to the appropriate time of day array; any sample whose name
        # does not contain "AM" is treated as a PM sample
        if "AM" in sample:
            subject_AM_cell_ratios.append(ratio)
            num_AM_samples += 1
            AM_marker_colors.append(marker_color)
            AM_text_labels.append(text_label)
        else:
            subject_PM_cell_ratios.append(ratio)
            num_PM_samples += 1
            PM_marker_colors.append(marker_color)
            PM_text_labels.append(text_label)
        print(subject_id, sample, num_cells)

    # Every sample is plotted, even if the subject ends up excluded from the test
    AM_cell_ratios.extend(subject_AM_cell_ratios)
    PM_cell_ratios.extend(subject_PM_cell_ratios)

    # Two-sided, two-sample t-test of this subject's AM ratios against their PM ratios
    t_statistic, p = stats.ttest_ind(subject_AM_cell_ratios, subject_PM_cell_ratios)

    # The test is undefined when a group has too few samples; a NaN here would
    # turn the combined p value into NaN, so leave this subject out of the combination
    if numpy.isnan(p):
        print("Skipping %s in the combined test: not enough AM/PM samples" % subject_id)
        continue

    # Convert the two-sided p value into a one-sided one. When the statistic points
    # in the hypothesized direction the one-sided p is p/2, otherwise it is 1 - p/2.
    if HYPOTHESIS == "greater":
        p = p / 2 if t_statistic > 0 else 1 - p / 2
    elif HYPOTHESIS == "less":
        p = p / 2 if t_statistic < 0 else 1 - p / 2

    p_values.append(p)
    num_samples.append(len(subject_AM_cell_ratios) + len(subject_PM_cell_ratios))

# We combine subjects' independent p values, since we are assuming each person may
# have a different baseline, but we are testing the same hypothesis
if p_values:
    z, p_value = stats.combine_pvalues(p_values, method="stouffer", weights=num_samples)

    # NOTE(review): the combined one-sided p value is doubled, as in the original
    # analysis, which makes the reported value more conservative — confirm this is
    # intended. It is capped at 1 so the reported number stays a valid probability.
    if HYPOTHESIS in ("greater", "less"):
        p_value = min(p_value * 2, 1.0)
else:
    # No subject had enough samples to be tested
    z = p_value = float("nan")

In [None]:
# Box + jittered scatter plot of the per-sample CELL_TYPE ratios, AM vs PM, annotated
# with the combined p value, shown inline and exported to figures/ as SVG and HTML.

def _black_line(x0, x1, y0, y1, width):
    """Return a black line shape from (x0, y0) to (x1, y1) with the given width."""
    return graph_objects.layout.Shape(
        type="line",
        x0=x0,
        x1=x1,
        y0=y0,
        y1=y1,
        line=dict(
            color="Black",
            width=width
        )
    )


# For plotting upper bounds
y_max = max(max(AM_cell_ratios), max(PM_cell_ratios))

# Arbitrary x axis coordinates for plotly - just so they're distinct and numeric.
# Sized from the ratio lists themselves so x and y always have the same length.
AM_sample_types = numpy.array([0] * len(AM_cell_ratios))
PM_sample_types = numpy.array([1] * len(PM_cell_ratios))


AM_box_trace = graph_objects.Box(
    x=AM_sample_types,
    y=AM_cell_ratios,
    marker={
        "color": "blue"
    },
    showlegend=False
)

PM_box_trace = graph_objects.Box(
    x=PM_sample_types,
    y=PM_cell_ratios,
    marker={
        "color": "red"
    },
    showlegend=False
)

# Individual points are drawn to the left of each box (offset -0.35) with a small
# horizontal jitter of at most +/-0.05. A locally seeded generator keeps the exported
# figure identical between runs without touching numpy's global random state.
jitter_rng = numpy.random.RandomState(0)

AM_sample_types_jittered = AM_sample_types - 0.35
AM_sample_types_jittered += (0.5 - jitter_rng.rand(len(AM_sample_types_jittered))) / 10

PM_sample_types_jittered = PM_sample_types - 0.35
PM_sample_types_jittered += (0.5 - jitter_rng.rand(len(PM_sample_types_jittered))) / 10

AM_scatter_trace = graph_objects.Scatter(
    x=AM_sample_types_jittered,
    y=AM_cell_ratios,
    marker_color=AM_marker_colors,
    mode="markers",
    showlegend=False,
    name="AM",
    text=AM_text_labels
)

PM_scatter_trace = graph_objects.Scatter(
    x=PM_sample_types_jittered,
    y=PM_cell_ratios,
    marker_color=PM_marker_colors,
    mode="markers",
    showlegend=False,
    name="PM",
    text=PM_text_labels
)

layout = graph_objects.Layout(
    {
        "xaxis": {
            "title": "Time of Day",
            "tickvals": [0, 1],
            "ticktext": ["AM", "PM"]
        },
        "yaxis": {
            "range": [0, y_max * 1.1],
            "title": "% of Cells",
            # The plotted values are fractions; render them as percentages so the
            # tick labels agree with the axis title
            "tickformat": ".1%"
        },
        "plot_bgcolor": "rgba(0, 0, 0, 0)",
        "width": 400,
        "title": {
            "text": "%s Distribution AM vs PM" % CELL_TYPE,
            "xanchor": "center",
            "xref": "paper",
            "x": 0.5
        }
    }
)

figure = graph_objects.Figure(data=[AM_box_trace, AM_scatter_trace, PM_box_trace, PM_scatter_trace], layout=layout)

# Significance bracket: a horizontal bar spanning AM (x=0) to PM (x=1) at the top of
# the y range, with a short vertical tick dropping down at each end
significance_line = _black_line(0, 1, y_max * 1.1, y_max * 1.1, 5)
significance_bracket_left = _black_line(0, 0, y_max * 1.10, y_max * 1.05, 4)
significance_bracket_right = _black_line(1, 1, y_max * 1.10, y_max * 1.05, 4)

figure.add_shape(significance_line)
figure.add_shape(significance_bracket_left)
figure.add_shape(significance_bracket_right)

# Combined p value, centered just above the bracket
figure.add_annotation(
    graph_objects.layout.Annotation(
        text="p=%.1e" % p_value,
        showarrow=False,
        yanchor="bottom",
        yref="y",
        y=y_max * 1.10,
        x=0.5,
        xref="x",
        xanchor="center"
    )
)

plotly.iplot(figure)

# Create the output directory if needed so the export works on a fresh checkout
os.makedirs("figures", exist_ok=True)

figure.write_image(os.path.join("figures", "%s_ratio_AM_vs_PM.svg" % CELL_TYPE.replace(" ", "_")))
figure.write_html(os.path.join("figures", "%s_ratio_AM_vs_PM.html" % CELL_TYPE.replace(" ", "_")))