In [None]:
import os

import numpy
import pandas
from scipy import stats

import capblood_seq
from capblood_seq import config

In [None]:
# Threshold handed to dataset.filter_genes_by_percent_abundance() when the
# dataset is filtered below.
# NOTE(review): presumably a minimum abundance expressed in percent (i.e. 0.1%),
# judging by the method name -- confirm units against capblood_seq.
GENE_ABUNDANCE_FILTER = 0.1

In [None]:
# Bring the normalized-pipeline dataset into memory from the local "data"
# directory (it is downloaded first if it is not already present).
dataset = capblood_seq.load_dataset(
    data_directory="data",
    pipeline_name="normalized",
)

In [None]:
# Restrict the dataset to the cells we were able to identify, so that any
# percentages are computed relative to those cells only.
dataset.filter_multi_labeled_cells(config.CELL_TYPES)

# Then apply the gene abundance threshold.
# NOTE(review): any_sample=True presumably keeps a gene if it passes in at
# least one sample -- confirm against capblood_seq.
dataset.filter_genes_by_percent_abundance(
    GENE_ABUNDANCE_FILTER,
    any_sample=True,
)

In [None]:
# Cache normalized transcript counts as arrays so the per-gene loops further
# down can slice columns quickly instead of querying the dataset each time.
#
# Keys are (cell_type, subject_id, sample) tuples. A cell_type of None and/or a
# subject_id of None is passed straight through to the dataset; presumably that
# means "no filter on that dimension" -- TODO confirm against capblood_seq.
# Combinations for which the dataset returns None are simply left out.

cell_type_subject_sample_transcript_counts = {}

for cell_type in config.CELL_TYPES + [None]:

    # Per-subject entries for this cell type
    for subject_id in config.SUBJECT_IDS:
        for sample in config.SAMPLE_NAMES:

            counts = dataset.get_transcript_counts(
                sample,
                cell_type,
                subject_id,
                normalized=True
            )

            if counts is not None:
                cache_key = (cell_type, subject_id, sample)
                cell_type_subject_sample_transcript_counts[cache_key] = counts.to_array()

    # Entries with no subject specified, stored under a subject of None
    for sample in config.SAMPLE_NAMES:

        counts = dataset.get_transcript_counts(
            sample,
            cell_type,
            subject_id=None,
            normalized=True
        )

        if counts is not None:
            cache_key = (cell_type, None, sample)
            cell_type_subject_sample_transcript_counts[cache_key] = counts.to_array()

In [None]:
# For every (gene, cell type) pair, run a one-way ANOVA across subjects, where
# each subject's group holds that subject's per-sample mean expression of the
# gene. Also tally, per (gene, cell type), how many cells were seen and how
# many of them had a non-zero count for the gene.
#
# All four arrays are indexed [gene_index, cell_type_index].
individual_p_values = numpy.zeros((dataset.get_num_genes(), len(config.CELL_TYPES)))
individual_f_statistics = numpy.zeros((dataset.get_num_genes(), len(config.CELL_TYPES)))
num_cells_expressing_gene = numpy.zeros((dataset.get_num_genes(), len(config.CELL_TYPES)))
num_cells = numpy.zeros((dataset.get_num_genes(), len(config.CELL_TYPES)))

for cell_type_index, cell_type in enumerate(config.CELL_TYPES):

    for gene_index, gene in enumerate(dataset._gene_list):

        # One list of per-sample means per subject (one ANOVA group each)
        all_subject_gene_counts = []

        for subject_id in config.SUBJECT_IDS:

            subject_gene_means = []

            for sample in config.SAMPLE_NAMES:

                cell_type_subject_sample = (cell_type, subject_id, sample)

                # Combinations with no data were never cached; skip them
                if cell_type_subject_sample not in cell_type_subject_sample_transcript_counts:
                    continue

                transcript_counts = cell_type_subject_sample_transcript_counts[cell_type_subject_sample][:, gene_index]
                subject_gene_means.append(transcript_counts.mean())
                num_cells_expressing_gene[gene_index, cell_type_index] += \
                    numpy.count_nonzero(transcript_counts > 0)
                num_cells[gene_index, cell_type_index] += transcript_counts.shape[0]

            all_subject_gene_counts.append(subject_gene_means)

        # Skipped samples make the groups ragged, so test each group on its own
        # rather than converting the whole list of lists to one array (which
        # raises on recent NumPy and mis-evaluates on older versions).
        has_any_expression = any(
            numpy.any(subject_gene_means) for subject_gene_means in all_subject_gene_counts
        )

        if has_any_expression:
            f, p = stats.f_oneway(*all_subject_gene_counts)
        else:
            # Every mean is zero (or there is no data): no test to run
            f = numpy.nan
            p = numpy.nan

        individual_p_values[gene_index, cell_type_index] = p
        individual_f_statistics[gene_index, cell_type_index] = f

In [None]:
# A NaN p-value is treated as 1 so it can never win the per-gene minimum
# taken over cell types.
nan_p_value_mask = numpy.isnan(individual_p_values)
individual_p_values[nan_p_value_mask] = 1

In [None]:
# Assemble the per-gene results table, one row per gene:
#   * an F statistic column for each cell type,
#   * a "By Cell Type F Statistic" column: the F statistic of whichever cell
#     type has the smallest p-value for that gene,
#   * a p-value column for each cell type,
#   * a "By Cell Type p-value" column: that smallest p-value itself.

# Column position of the smallest p-value in every row
most_significant_column = individual_p_values.argmin(axis=1)
gene_rows = numpy.arange(individual_p_values.shape[0])

f_statistic_at_min_p = individual_f_statistics[gene_rows, most_significant_column]
min_p_value = individual_p_values[gene_rows, most_significant_column]

summary_labels = config.CELL_TYPES + ["By Cell Type"]
f_statistic_columns = [label + " F Statistic" for label in summary_labels]
p_value_columns = [label + " p-value" for label in summary_labels]

individual_genes_df = pandas.DataFrame(
    numpy.concatenate(
        (
            individual_f_statistics,
            f_statistic_at_min_p.reshape((-1, 1)),
            individual_p_values,
            min_p_value.reshape((-1, 1)),
        ),
        axis=1
    ),
    index=dataset._gene_list,
    columns=f_statistic_columns + p_value_columns
)

In [None]:
# Label each gene with the cell type whose p-value is smallest for that gene
most_significant_cell_type_indices = individual_p_values.argmin(axis=1)
individual_genes_df["Max Cell Type"] = [
    config.CELL_TYPES[cell_type_position]
    for cell_type_position in most_significant_cell_type_indices
]

In [None]:
# For every gene, run a one-way ANOVA across cell types, where each cell
# type's group holds the per-sample mean expression of the gene taken from the
# cache entries stored under a subject of None.
cell_type_p_values = numpy.zeros((dataset.get_num_genes()))
cell_type_f_statistics = numpy.zeros((dataset.get_num_genes()))

for gene_index, gene in enumerate(dataset._gene_list):

    # One list of per-sample means per cell type (one ANOVA group each)
    all_cell_type_gene_means = []

    for cell_type in config.CELL_TYPES:

        cell_type_gene_means = []

        for sample in config.SAMPLE_NAMES:

            cell_type_subject_sample = (cell_type, None, sample)

            # The preload step leaves out combinations with no data, so guard
            # the lookup (as the per-subject analysis does) instead of raising
            # a KeyError.
            if cell_type_subject_sample not in cell_type_subject_sample_transcript_counts:
                continue

            transcript_counts = cell_type_subject_sample_transcript_counts[cell_type_subject_sample][:, gene_index]
            cell_type_gene_means.append(transcript_counts.mean())

        all_cell_type_gene_means.append(cell_type_gene_means)

    # Groups may be ragged once samples are skipped, so test each group on its
    # own rather than converting the list of lists to a single array.
    has_any_expression = any(
        numpy.any(cell_type_gene_means) for cell_type_gene_means in all_cell_type_gene_means
    )

    if has_any_expression:
        f, p = stats.f_oneway(*all_cell_type_gene_means)
    else:
        # Every mean is zero (or there is no data): no test to run
        f = numpy.nan
        p = numpy.nan

    cell_type_p_values[gene_index] = p
    cell_type_f_statistics[gene_index] = f

In [None]:
# Append the across-cell-type ANOVA results to the table, F statistic first
for column_label, column_values in (
    ("Cell Type F Statistic", cell_type_f_statistics),
    ("Cell Type p-value", cell_type_p_values),
):
    individual_genes_df[column_label] = column_values

In [None]:
# Per gene, the largest fraction of cells (over cell types) with a non-zero
# count for that gene. Despite the name, values are fractions in [0, 1].
#
# A cell type with no cells at all would give 0/0 = NaN, and NaN would then
# propagate through max() to every gene; treat such a cell type as 0 instead.
fraction_cells_expressing_gene = numpy.divide(
    num_cells_expressing_gene,
    num_cells,
    out=numpy.zeros_like(num_cells_expressing_gene),
    where=num_cells > 0
)
percent_cells_expressing_gene = fraction_cells_expressing_gene.max(axis=1)

In [None]:
# Store the per-gene maximum (over cell types) share of cells expressing the gene.
# Note: despite the column name, the values are ratios of cell counts
# (fractions), not values scaled to 0-100.
individual_genes_df["Percent Cells Expressing"] = percent_cells_expressing_gene

In [None]:
# Write the per-gene results table to a CSV file inside the "data" directory
gene_individuality_scores_path = os.path.join("data", "gene_individuality_scores.csv")
individual_genes_df.to_csv(gene_individuality_scores_path)