In [None]:
import os

import numpy
import pandas
import scipy
from plotly import offline as plotly
from plotly import graph_objects
from scipy import stats
from statsmodels.stats import multitest

from pepars.plotting import plotting
plotting.init_notebook_mode()

import capblood_seq
from capblood_seq import config

In [None]:
# False discovery rate level(s); each entry is passed as `alpha` to the
# Benjamini-Hochberg ("fdr_bh") correction and yields one pair of threshold
# lines in the ANOVA figure.
FDR_THRESHOLDS = [0.05]
# Cutoff handed to dataset.filter_genes_by_percent_abundance (with any_sample=True).
# NOTE(review): presumably expressed in percent rather than as a fraction - confirm
# against capblood_seq.
GENE_ABUNDANCE_FILTER = 0.1

In [None]:
# Read the normalized pipeline output into memory; the files are fetched into
# the local data directory first if they are not already there
dataset = capblood_seq.load_dataset(
    data_directory="data",
    pipeline_name="normalized"
)

In [None]:
# Restrict the analysis to cells we were able to identify, first by cell type
# and then by subject (presumably this drops ambiguously labeled cells - see
# capblood_seq for the exact rule)
for label_set in (config.CELL_TYPES, config.SUBJECT_IDS):
    dataset.filter_multi_labeled_cells(label_set)

# Apply the gene abundance cutoff last, i.e. after the cell filters above
dataset.filter_genes_by_percent_abundance(GENE_ABUNDANCE_FILTER, any_sample=True)

In [None]:
# Dimensions of the filtered dataset
num_cell_types = len(config.CELL_TYPES)
num_genes = dataset.get_num_genes()
print("%s genes after filtering" % (num_genes,))

In [None]:
# Cache the normalized transcript counts as arrays, keyed by
# (cell type, subject, sample). The extra subject entry None stores the result
# of querying without a subject (presumably all subjects pooled). Combinations
# for which the dataset returns None are simply absent from the cache.

cell_type_subject_sample_transcript_counts = {}

for cell_type in config.CELL_TYPES:
    for subject_id in config.SUBJECT_IDS + [None]:
        for sample in config.SAMPLE_NAMES:

            counts = dataset.get_transcript_counts(sample, cell_type, subject_id, normalized=True)

            if counts is not None:
                cell_type_subject_sample_transcript_counts[(cell_type, subject_id, sample)] = counts.to_array()

In [None]:
# One-way ANOVA across subjects, run separately for every (gene, cell type) pair.
# Each subject is one group; the observations in a group are that subject's
# per-sample mean expression of the gene within the cell type.
individual_p_values = numpy.zeros((dataset.get_num_genes(), len(config.CELL_TYPES)))
individual_f_statistics = numpy.zeros((dataset.get_num_genes(), len(config.CELL_TYPES)))
# Running totals over all subjects and samples, used later to size the markers
num_cells_expressing_gene = numpy.zeros((dataset.get_num_genes(), len(config.CELL_TYPES)))
num_cells = numpy.zeros((dataset.get_num_genes(), len(config.CELL_TYPES)))

for cell_type_index, cell_type in enumerate(config.CELL_TYPES):

    for gene_index, gene in enumerate(dataset._gene_list):

        all_subject_gene_counts = []

        for subject_id in config.SUBJECT_IDS:

            subject_gene_means = []

            for sample in config.SAMPLE_NAMES:

                cell_type_subject_sample = (cell_type, subject_id, sample)

                # Combinations without any cells were never cached
                if cell_type_subject_sample not in cell_type_subject_sample_transcript_counts:
                    continue

                transcript_counts = cell_type_subject_sample_transcript_counts[cell_type_subject_sample][:, gene_index]
                subject_gene_means.append(transcript_counts.mean())
                num_cells_expressing_gene[gene_index, cell_type_index] += \
                    transcript_counts[transcript_counts > 0].shape[0]
                num_cells[gene_index, cell_type_index] += transcript_counts.shape[0]

            all_subject_gene_counts.append(subject_gene_means)

        # If every mean is zero the test is undefined, so record NaN. The check is
        # done group by group: subjects can contribute different numbers of
        # samples, and a ragged list of lists cannot be turned into a regular
        # array for a single numpy.any call.
        if not any(numpy.any(means) for means in all_subject_gene_counts):
            p = numpy.nan
            f = numpy.nan
        else:
            f, p = scipy.stats.f_oneway(*all_subject_gene_counts)

        individual_p_values[gene_index, cell_type_index] = p
        individual_f_statistics[gene_index, cell_type_index] = f

In [None]:
# Flatten the (gene, cell type) p-value matrix so that every test takes part in
# a single multiple comparison correction
individual_p_values_unwrapped = individual_p_values.reshape((-1, ))
individual_p_values_nan_mask = numpy.isnan(individual_p_values_unwrapped)

# Undefined tests (NaN) are excluded from the correction
individual_p_values_non_nan = individual_p_values_unwrapped[~individual_p_values_nan_mask]

# For each FDR level, the F statistic of the least significant rejected test
individual_thresholds = []

for threshold in FDR_THRESHOLDS:
    # Benjamini-Hochberg correction at FDR = threshold
    rejected, _, _, _ = multitest.multipletests(individual_p_values_non_nan, method="fdr_bh", alpha=threshold)

    if not rejected.any():
        # Nothing is significant at this level: use an unreachable threshold
        # instead of failing on the max of an empty selection
        individual_thresholds.append(numpy.inf)
        continue

    p_value_threshold = individual_p_values_non_nan[rejected].max()
    individual_thresholds.append(individual_f_statistics[individual_p_values==p_value_threshold][0])

# Treat undefined p-values as 1 when searching for the minimum. This is done on a
# copy so individual_p_values keeps its NaNs and re-running this cell cannot feed
# the placeholder 1s into the correction above.
individual_p_values_filled = numpy.where(numpy.isnan(individual_p_values), 1, individual_p_values)

# Get the index of the lowest p-value cell type for each gene
lowest_cell_types = individual_p_values_filled.argmin(axis=1)

# Get the F statistic associated with the lowest p-value of each gene
top_individual_f_statistics = individual_f_statistics[
    numpy.arange(individual_f_statistics.shape[0]),
    lowest_cell_types
]

In [None]:
# Rank genes by their strongest subject-level F statistic and save the table
individual_genes_df = pandas.DataFrame(
    {"F": top_individual_f_statistics},
    index=dataset._gene_list
)
individual_genes_df = individual_genes_df.sort_values(by="F", ascending=False)

gene_individuality_path = os.path.join("data", "gene_individuality.csv")
individual_genes_df.to_csv(gene_individuality_path)

In [None]:
# One-way ANOVA across cell types, run once per gene. Each cell type is one
# group; the observations in a group are the per-sample mean expression of the
# gene, taken from the cache entries stored under subject None.
cell_type_p_values = numpy.zeros((dataset.get_num_genes()))
cell_type_f_statistics = numpy.zeros((dataset.get_num_genes()))

for gene_index, gene in enumerate(dataset._gene_list):

    all_cell_type_gene_means = []

    for cell_type in config.CELL_TYPES:

        cell_type_gene_means = []

        for sample in config.SAMPLE_NAMES:

            cell_type_subject_sample = (cell_type, None, sample)

            # Combinations without any cells were never cached; skip them the
            # same way the per-subject analysis does instead of raising KeyError
            if cell_type_subject_sample not in cell_type_subject_sample_transcript_counts:
                continue

            transcript_counts = cell_type_subject_sample_transcript_counts[cell_type_subject_sample][:, gene_index]
            cell_type_gene_means.append(transcript_counts.mean())

        all_cell_type_gene_means.append(cell_type_gene_means)

    # If every mean is zero the test is undefined, so record NaN. Checked group
    # by group because the groups may have different lengths.
    if not any(numpy.any(means) for means in all_cell_type_gene_means):
        p = numpy.nan
        f = numpy.nan
    else:
        f, p = scipy.stats.f_oneway(*all_cell_type_gene_means)

    cell_type_p_values[gene_index] = p
    cell_type_f_statistics[gene_index] = f

In [None]:
# For each FDR level, the F statistic of the least significant rejected gene
cell_type_thresholds = []

# Undefined tests (NaN) are excluded from the correction, as is done for the
# per-subject p-values; multipletests does not handle NaN input
cell_type_p_values_non_nan = cell_type_p_values[~numpy.isnan(cell_type_p_values)]

for threshold in FDR_THRESHOLDS:

    # Benjamini-Hochberg correction at FDR = threshold
    rejected, _, _, _ = multitest.multipletests(cell_type_p_values_non_nan, method="fdr_bh", alpha=threshold)

    if not rejected.any():
        # Nothing is significant at this level: use an unreachable threshold
        # instead of failing on the max of an empty selection
        cell_type_thresholds.append(numpy.inf)
        continue

    p_value_threshold = cell_type_p_values_non_nan[rejected].max()
    cell_type_thresholds.append(cell_type_f_statistics[cell_type_p_values==p_value_threshold][0])

In [None]:
# Benjamini-Hochberg correction of the cell type p-values. Genes whose test was
# undefined (NaN) are left out of the correction, because a NaN in the input
# would otherwise propagate into the corrected p-values; they are reported as
# not rejected with a NaN corrected p-value.
cell_type_defined = ~numpy.isnan(cell_type_p_values)

cell_type_rejected = numpy.zeros(cell_type_p_values.shape, dtype=bool)
corrected_p_values = numpy.full(cell_type_p_values.shape, numpy.nan)

if cell_type_defined.any():
    cell_type_rejected[cell_type_defined], corrected_p_values[cell_type_defined], _, _ = \
        multitest.multipletests(cell_type_p_values[cell_type_defined], method="fdr_bh", alpha=0.05)

cell_type_significant_genes = cell_type_rejected

# Significant genes with their corrected p-values, most significant first
cell_type_significant_genes_df = pandas.DataFrame(
    corrected_p_values[cell_type_significant_genes],
    index=numpy.array(dataset._gene_list)[cell_type_significant_genes],
    columns=["p"]
).sort_values(by="p")

In [None]:
# Fraction of cells expressing each gene within each cell type, reduced to the
# highest fraction across cell types. A cell type with no cells at all
# contributes 0 rather than a 0/0 NaN, which would otherwise turn the maximum of
# every gene into NaN.
percent_cells_expressing_gene = numpy.divide(
    num_cells_expressing_gene,
    num_cells,
    out=numpy.zeros_like(num_cells_expressing_gene),
    where=num_cells > 0
).max(axis=1)

In [None]:
# Color every gene by the cell type in which its subject effect was strongest
colors = []
for lowest_cell_type_index in lowest_cell_types:
    lowest_cell_type = config.CELL_TYPES[lowest_cell_type_index]
    colors.append(config.CELL_TYPE_COLORS[lowest_cell_type])

In [None]:
# Both axes of the figure are log2 F statistics: x is the cell type effect,
# y is the strongest subject effect of each gene
x_values, y_values = (
    numpy.log2(f_statistics)
    for f_statistics in (cell_type_f_statistics, top_individual_f_statistics)
)

In [None]:
# Axis limits rounded outwards to the nearest quarter. Only finite values are
# considered: undefined tests give NaN and an F statistic of 0 gives -inf in
# log2 space, and either would otherwise turn the axis range into NaN/inf.
finite_x_values = x_values[numpy.isfinite(x_values)]
finite_y_values = y_values[numpy.isfinite(y_values)]

min_x = numpy.floor(finite_x_values.min()*4)/4
max_x = numpy.ceil(finite_x_values.max()*4)/4
min_y = numpy.floor(finite_y_values.min()*4)/4
max_y = numpy.ceil(finite_y_values.max()*4)/4

# Main scatter (one marker per gene) plus marginal histograms on secondary axes
data = [
    graph_objects.Scatter(
        x=x_values,
        y=y_values,
        mode="markers",
        text=dataset._gene_list,
        marker={
            # Marker size grows with the fraction of cells expressing the gene
            "size": 2+percent_cells_expressing_gene*7,
            "color": colors,
            "line": {
                "width": 1,
                "color": "rgba(0, 0, 0, 0.5)"
            }
        },
        opacity=0.5
    ),
    graph_objects.Histogram(
        x=x_values,
        name="x density",
        marker=dict(color="red", opacity=0.5),
        yaxis="y2"
    ),
    graph_objects.Histogram(
        y=y_values,
        name="y density",
        marker=dict(color="red", opacity=0.5),
        xaxis="x2"
    )
]

# One vertical (cell type) and one horizontal (subject) significance line per
# FDR level. These are plain line-mode Scatter traces; the generic
# graph_objects.Line class is deprecated.
for threshold_index, threshold in enumerate(FDR_THRESHOLDS):
    log2_cell_type_threshold = numpy.log2(cell_type_thresholds[threshold_index])
    log2_individual_threshold = numpy.log2(individual_thresholds[threshold_index])

    data.append(
        graph_objects.Scatter(
            x=[log2_cell_type_threshold, log2_cell_type_threshold],
            y=[min_y, max_y],
            mode="lines",
            line=dict(
                color="black",
                width=2
            )
        )
    )

    data.append(
        graph_objects.Scatter(
            x=[min_x, max_x],
            y=[log2_individual_threshold, log2_individual_threshold],
            mode="lines",
            line=dict(
                color="black",
                width=2
            )
        )
    )

layout = graph_objects.Layout(
    title= "Subject and Cell Type ANOVA",
    showlegend=False,
    autosize=False,
    width=850,
    height=850,
    xaxis=dict(
        range=[min_x, max_x],
        domain=[0, 0.83],
        showgrid=False,
        zeroline=False,
        title="Cell Type F-statistic (Log2)"),
    yaxis=dict(
        range=[min_y, max_y],
        domain=[0, 0.83],
        showgrid=False,
        zeroline=False,
        title="Subject F-statistic (Log2)"
    ),
    margin=dict(
        l=20,
        t=100),
    hovermode="closest",
    bargap=0.1,
    xaxis2=dict(
        domain=[0.85, 1],
        showgrid=False,
        zeroline=False,
        title="# Genes"
    ),
    yaxis2=dict(
        domain=[0.85, 1],
        showgrid=False,
        zeroline=False,
        title="# Genes"
    ),
    plot_bgcolor="rgba(255, 255, 255, 0)",
    paper_bgcolor="rgba(255, 255, 255, 0)"
)

figure = graph_objects.Figure( data=data, layout=layout )

plotly.iplot(figure)

In [None]:
# Report, for each FDR level, how many genes lie beyond the corresponding
# log2 F statistic cutoff on each axis
for threshold_index, threshold in enumerate(FDR_THRESHOLDS):
    log2_cell_type_cutoff = numpy.log2(cell_type_thresholds[threshold_index])
    log2_individual_cutoff = numpy.log2(individual_thresholds[threshold_index])

    num_above_cell_type_threshold = numpy.count_nonzero(x_values > log2_cell_type_cutoff)
    num_above_individual_threshold = numpy.count_nonzero(y_values > log2_individual_cutoff)

    print("FDR %.2f" % threshold)
    print("Num above cell type threshold: %i/%i" % (num_above_cell_type_threshold, len(x_values)))
    print("Num above individual threshold: %i/%i" % (num_above_individual_threshold, len(y_values)))

In [None]:
# Save the ANOVA figure as a static image and as interactive HTML. The output
# directory is created first so a fresh checkout does not fail on the write.
os.makedirs("figures", exist_ok=True)
figure.write_image(os.path.join("figures", "individual_cell_type_significance.svg"))
figure.write_html(os.path.join("figures", "individual_cell_type_significance.html"))

In [None]:
# Walk the genes from the strongest to the weakest cell type effect and track
# the running fraction of genes that are also individual-specific, i.e. whose
# subject F statistic exceeds the subject significance threshold.
percent_at_rank = []
individual_specific_count = 0

total_count = 0

# y_values are log2 subject F statistics, so they have to be compared with the
# log2 of the subject (individual) threshold, not with the raw cell type one
log2_individual_threshold = numpy.log2(individual_thresholds[0])

for gene_index in (-x_values).argsort():
    total_count += 1
    if y_values[gene_index] > log2_individual_threshold:
        individual_specific_count += 1
    percent_at_rank.append(individual_specific_count/total_count)

# Rank correlation between the two effects; genes for which either statistic is
# undefined (NaN) are left out so they do not turn the coefficient into NaN
defined_pairs = ~(numpy.isnan(x_values) | numpy.isnan(y_values))
rho, p = scipy.stats.spearmanr(x_values[defined_pairs], y_values[defined_pairs])

In [None]:
# Hover labels must follow the same ordering that produced percent_at_rank,
# which ranks genes by descending log2 cell type F statistic
rank_order = (-x_values).argsort()

# Running percentage of individual-specific genes against cell type rank.
# (No trailing comma after the call: it would silently wrap the trace in a tuple.)
scatter = graph_objects.Scatter(
    x=numpy.arange(len(dataset._gene_list))+1,
    y=numpy.array(percent_at_rank)*100,
    mode="markers",
    text=[dataset._gene_list[gene_index] for gene_index in rank_order]
)

layout = graph_objects.Layout(
    title= "Subject and Cell Type Rank Correlation (ρ = %.3f)" % rho,
    showlegend=False,
    xaxis=dict(
        title="Cell Type Specificity Rank"
    ),
    yaxis=dict(
        gridcolor="rgba(0, 0, 0, 0.1)",
        title="% of genes Individual-Specific",
        range=[0, 100]
    ),
    hovermode="closest",
    plot_bgcolor="rgba(255, 255, 255, 0)",
    paper_bgcolor="rgba(255, 255, 255, 0)"
)

figure = graph_objects.Figure(data=[scatter], layout=layout )

plotly.iplot(figure)

In [None]:
# Save the rank correlation figure as a static image and as interactive HTML.
# The output directory is created first so a fresh checkout does not fail.
os.makedirs("figures", exist_ok=True)
figure.write_image(os.path.join("figures","cell_type_individual_rank_correlation.svg"))
figure.write_html(os.path.join("figures","cell_type_individual_rank_correlation.html"))