In [None]:
import os

import numpy
import pandas
import scipy
from plotly import offline as plotly
from plotly import graph_objects
from scipy import stats
from statsmodels.stats import multitest

from pepars.plotting import plotting
plotting.init_notebook_mode()

import capblood_seq
from capblood_seq import config

In [None]:
# False discovery rate levels; each one is passed as `alpha` to the
# Benjamini-Hochberg ("fdr_bh") corrections run later in this notebook.
FDR_THRESHOLDS = [0.05]
# Cutoff handed to dataset.filter_genes_by_percent_abundance.
# NOTE(review): unclear from this notebook whether 0.1 means 0.1% or a
# fraction (10%) - confirm against capblood_seq.
GENE_ABUNDANCE_FILTER = 0.1

In [None]:
# Fetch the dataset (downloaded on first use if it is not on disk yet) and
# hold it in memory for the rest of the notebook.
dataset = capblood_seq.load_dataset(
    data_directory="data",
    pipeline_name="normalized",
)

In [None]:
# Percentages should only be relative to cells we could identify, so apply the
# multi-label filter once for the cell type labels and once for the subject
# labels, then apply the gene abundance filter.
for label_group in (config.CELL_TYPES, config.SUBJECT_IDS):
    dataset.filter_multi_labeled_cells(label_group)

dataset.filter_genes_by_percent_abundance(
    GENE_ABUNDANCE_FILTER,
    any_sample=True,
)

In [None]:
# Record the problem size after filtering and report the gene count
num_cell_types = len(config.CELL_TYPES)
num_genes = dataset.get_num_genes()

print("%s genes after filtering" % (num_genes,))

In [None]:
# Cache the normalized transcript counts of every (cell type, subject, sample)
# combination as an array, keyed by that tuple. A trailing None is appended to
# both the cell types and the subject IDs, so the cache also holds entries
# whose key has None in those positions.
# NOTE(review): None presumably means "do not filter on that label" - confirm
# against capblood_seq's get_transcript_counts.

cell_type_subject_sample_transcript_counts = {}

cell_type_options = config.CELL_TYPES + [None]
subject_options = config.SUBJECT_IDS + [None]

for cell_type in cell_type_options:
    for subject_id in subject_options:
        for sample in config.SAMPLE_NAMES:

            counts = dataset.get_transcript_counts(
                sample, cell_type, subject_id, normalized=True
            )

            # A None result is skipped, so that combination gets no cache entry
            if counts is not None:
                cache_key = (cell_type, subject_id, sample)
                cell_type_subject_sample_transcript_counts[cache_key] = counts.to_array()

In [None]:
# One-way ANOVA across subjects, run separately for every (gene, cell type)
# pair. Each subject forms one group; the observations in a group are the
# per-sample mean expression values of the gene within that cell type.
#
# Outputs, all shaped (num genes, num cell types):
#   individual_p_values / individual_f_statistics - ANOVA results (NaN when
#       the test was not run because every mean is zero)
#   num_cells_expressing_gene - cells with a count > 0, summed over subjects
#       and samples
#   num_cells - total cells, summed over subjects and samples
result_shape = (dataset.get_num_genes(), len(config.CELL_TYPES))

individual_p_values = numpy.zeros(result_shape)
individual_f_statistics = numpy.zeros(result_shape)
num_cells_expressing_gene = numpy.zeros(result_shape)
num_cells = numpy.zeros(result_shape)

subject_cell_type_p_values = []
subject_cell_type_p_value_subject_ids = []

for cell_type_index, cell_type in enumerate(config.CELL_TYPES):

    for gene_index, gene in enumerate(dataset._gene_list):

        # One list of per-sample means per subject
        all_subject_gene_counts = []

        for subject_id in config.SUBJECT_IDS:

            subject_gene_means = []

            for sample in config.SAMPLE_NAMES:

                cell_type_subject_sample = (cell_type, subject_id, sample)

                # Combinations that were never cached contribute nothing
                if cell_type_subject_sample not in cell_type_subject_sample_transcript_counts:
                    continue

                transcript_counts = cell_type_subject_sample_transcript_counts[cell_type_subject_sample][:, gene_index]
                subject_gene_means.append(transcript_counts.mean())
                num_cells_expressing_gene[gene_index, cell_type_index] += \
                    transcript_counts[transcript_counts > 0].shape[0]
                num_cells[gene_index, cell_type_index] += transcript_counts.shape[0]

            all_subject_gene_counts.append(subject_gene_means)

        # Check each group on its own: the groups can have different lengths
        # (subjects missing samples), and handing such a ragged list of lists
        # to numpy.any either raises (NumPy >= 1.24) or tests list truthiness
        # instead of the values.
        has_nonzero_mean = any(numpy.any(group) for group in all_subject_gene_counts)

        if not has_nonzero_mean:
            # Nothing expressed anywhere: mark as "not tested"
            p = numpy.nan
            f = numpy.nan
        else:
            f, p = stats.f_oneway(*all_subject_gene_counts)

        individual_p_values[gene_index, cell_type_index] = p
        individual_f_statistics[gene_index, cell_type_index] = f

In [None]:
# Flatten the (gene, cell type) p-value matrix so every test is corrected together
individual_p_values_unwrapped = individual_p_values.reshape((-1, ))
individual_p_values_nan_mask = numpy.isnan(individual_p_values_unwrapped)

# NaN marks tests that were not run; keep them out of the multiple comparison correction
individual_p_values_non_nan = individual_p_values_unwrapped[~individual_p_values_nan_mask]

# For each FDR level, the F statistic of the least significant rejected test
individual_thresholds = []

for threshold in FDR_THRESHOLDS:
    # Benjamini-Hochberg correction at FDR = threshold
    rejected = multitest.multipletests(individual_p_values_non_nan, method="fdr_bh", alpha=threshold)[0]

    if rejected.any():
        # Largest raw p-value that is still rejected, and the F statistic of that test
        p_value_threshold = individual_p_values_non_nan[rejected].max()
        individual_thresholds.append(individual_f_statistics[individual_p_values == p_value_threshold][0])
    else:
        # Nothing survives correction. NaN compares False against every
        # p-value, so later threshold comparisons select no genes instead of
        # this cell crashing on an empty .max().
        p_value_threshold = numpy.nan
        individual_thresholds.append(numpy.nan)

# Fill in any NaNs with 1 so they can never win the minimization below.
# NOTE: this edits individual_p_values in place (later cells read the filled
# values), so re-running this cell without re-running the ANOVA cell would
# feed the filled 1s into the correction above.
individual_p_values[numpy.isnan(individual_p_values)] = 1

# Get the index of the lowest p-value cell type for each gene
lowest_cell_types = individual_p_values.argmin(axis=1)

# Get the F statistic that goes with each gene's lowest p-value
top_individual_f_statistics = individual_f_statistics[
    numpy.arange(individual_f_statistics.shape[0]), lowest_cell_types
]

In [None]:
# For every gene, pick the p-value of its most significant cell type by
# pairing each row index with that row's chosen column index.
gene_positions = numpy.arange(len(lowest_cell_types))
top_individual_p_values = individual_p_values[gene_positions, lowest_cell_types]

In [None]:
# Flag genes whose best cell-type p-value was rejected by the FDR correction.
# p_value_threshold is itself the largest rejected p-value, so the comparison
# has to be inclusive; a strict "<" would drop that boundary test.
any_cell_type_rejected = (top_individual_p_values <= p_value_threshold)

In [None]:
# One-vs-rest t-tests: for every (gene, cell type, subject), compare that
# subject's per-sample mean expression against the per-sample means of all the
# other subjects pooled together.
#
# all_p_values[i] is the p-value of the test described by
# gene_subject_cell_types[i] == (gene, cell_type, subject_id).
all_p_values = []
gene_subject_cell_types = []

for gene_index, gene in enumerate(dataset._gene_list):

    for cell_type in config.CELL_TYPES:

        # The per-sample means do not depend on which subject is being tested,
        # so compute them once per (gene, cell type) instead of once per subject.
        sample_means_by_subject = {}

        for subject_id in config.SUBJECT_IDS:

            sample_means = []

            for sample in config.SAMPLE_NAMES:

                cell_type_subject_sample = (cell_type, subject_id, sample)

                # Combinations that were never cached contribute nothing
                if cell_type_subject_sample not in cell_type_subject_sample_transcript_counts:
                    continue

                transcript_counts = cell_type_subject_sample_transcript_counts[cell_type_subject_sample][:, gene_index]
                sample_means.append(transcript_counts.mean())

            sample_means_by_subject[subject_id] = sample_means

        for subject_id in config.SUBJECT_IDS:

            subject_gene_means = sample_means_by_subject[subject_id]
            other_gene_means = [
                gene_mean
                for other_subject_id in config.SUBJECT_IDS
                if other_subject_id != subject_id
                for gene_mean in sample_means_by_subject[other_subject_id]
            ]

            p_value = stats.ttest_ind(subject_gene_means, other_gene_means)[1]

            # A NaN p-value means the test could not be evaluated (e.g. too few
            # samples or no variation). Leave it out, as the ANOVA analysis in
            # this notebook does, so it is not counted as a performed test by
            # the multiple comparison correction.
            if numpy.isnan(p_value):
                continue

            all_p_values.append(p_value)
            gene_subject_cell_types.append((gene, cell_type, subject_id))

In [None]:
# For each subject, collect the genes that are significant in at least one
# cell type, then count them.
# Note: the gene sets are created once, before the loop, so with several FDR
# levels they accumulate across levels; the counts reflect the state after the
# last level processed.
subject_significant_genes = {x: set() for x in config.SUBJECT_IDS}

for threshold in FDR_THRESHOLDS:
    # Benjamini-Hochberg correction at FDR = threshold
    rejected = multitest.multipletests(all_p_values, method="fdr_bh", alpha=threshold)[0]

    # Walk the labels and the rejection flags side by side. Converting the
    # label tuples to a numpy array would coerce every label to one string
    # dtype, which breaks the dictionary lookup for non-string IDs.
    for (gene, cell_type, subject_id), is_rejected in zip(gene_subject_cell_types, rejected):
        if is_rejected:
            subject_significant_genes[subject_id].add(gene)

    subject_significant_counts = {x: len(subject_significant_genes[x]) for x in subject_significant_genes}

In [None]:
# For each (cell type, subject) pair, collect the genes whose one-vs-rest test
# is significant, then count them. Both dictionaries are rebuilt for every FDR
# level, so they hold the results of the last level processed.
for threshold in FDR_THRESHOLDS:

    subject_cell_type_significant_genes = {cell_type: {x: set() for x in config.SUBJECT_IDS} for cell_type in config.CELL_TYPES}

    # Benjamini-Hochberg correction at FDR = threshold
    rejected = multitest.multipletests(all_p_values, method="fdr_bh", alpha=threshold)[0]

    # Walk the labels and the rejection flags side by side. Converting the
    # label tuples to a numpy array would coerce every label to one string
    # dtype, which breaks the dictionary lookups for non-string labels.
    for (gene, cell_type, subject_id), is_rejected in zip(gene_subject_cell_types, rejected):
        if is_rejected:
            subject_cell_type_significant_genes[cell_type][subject_id].add(gene)

    subject_cell_type_significant_counts = {
        cell_type: {x: len(y) for x, y in genes_by_subject.items()}
        for cell_type, genes_by_subject in subject_cell_type_significant_genes.items()
    }

In [None]:
# Summary table: one row per subject, one column per cell type, plus an "Any"
# column holding the per-subject count of genes significant in any cell type.
any_cell_type_counts = pandas.Series(subject_significant_counts)
subject_cell_type_significance_df = pandas.DataFrame(
    subject_cell_type_significant_counts
).assign(Any=any_cell_type_counts)

subject_cell_type_significance_df

In [None]:
# Write the summary table to a CSV file in the data directory
significance_counts_path = os.path.join("data", "subject_cell_type_significance_counts.csv")
subject_cell_type_significance_df.to_csv(significance_counts_path)