In [None]:
import os
import pickle

import numpy
from scipy import stats
from statsmodels.stats import multitest

import capblood_seq
from capblood_seq import config

In [None]:
# Fetch the dataset from the local "data" directory (it is downloaded first if
# it is not there yet) and hold it in memory
dataset = capblood_seq.load_dataset(
    data_directory="data",
    pipeline_name="normalized"
)

# NOTE(review): presumably drops cells that carry more than one of the
# config.CELL_TYPES labels - confirm against capblood_seq
dataset.filter_multi_labeled_cells(config.CELL_TYPES)

In [None]:
# True:  pool the samples of all subjects into a single AM-vs-PM t-test.
# False: run one t-test per subject and combine the resulting p-values
#        with Stouffer's method.
POOL_SUBJECTS: bool = False

# Subtract a per-subject baseline from that subject's sample means before
# testing. Recommended whenever POOL_SUBJECTS is True.
NORMALIZE_WITHIN_SUBJECT: bool = False

In [None]:
# Cache of mean normalized transcript counts (one value per gene), keyed by
# (cell type, subject id, sample name). Computed once here so the per-gene
# loop further down only has to index into it.
cell_type_subject_sample_transcript_count_means = {}

# The extra None entry is the one labelled "All Cells" downstream
# (presumably "no cell type filter" - confirm against capblood_seq)
cell_type_choices = config.CELL_TYPES + [None]

for cell_type in cell_type_choices:
    for subject_id in config.SUBJECT_IDS:
        for sample in config.SAMPLE_NAMES:

            transcript_counts = dataset.get_transcript_counts(
                sample,
                cell_type=cell_type,
                subject_id=subject_id,
                normalized=True
            )

            # Combinations for which no counts come back get no cache entry
            if transcript_counts is not None:
                key = (cell_type, subject_id, sample)
                # Averaging over axis 0 leaves one mean per gene
                # (assumes rows are cells - TODO confirm)
                cell_type_subject_sample_transcript_count_means[key] = \
                    transcript_counts.to_array().mean(axis=0)

In [None]:
# For every gene: gather the per-sample mean expression split into AM and PM
# samples, test AM vs PM within each cell type (plus "All Cells"), and record
# the plotting metadata (colors, x-axis groups, jittered x positions).

# Seed for the cosmetic x-axis jitter, so that re-running the notebook
# reproduces the same jittered positions
JITTER_SEED = 0
# How far AM points are shifted left of their cell type's integer x position
AM_GROUP_OFFSET = 0.35
# Jitter is drawn uniformly from [0, 1 / JITTER_DIVISOR)
JITTER_DIVISOR = 40

jitter_rng = numpy.random.RandomState(JITTER_SEED)

# The trailing None entry is reported under the "All Cells" label
cell_types_and_all = config.CELL_TYPES + [None]

gene_data = {}

for gene_index, gene in enumerate(dataset._gene_list):

    AM_colors = []
    PM_colors = []
    AM_means = []
    PM_means = []
    p_values = []
    AM_mean_groups = []
    PM_mean_groups = []
    cell_type_label_list = []

    for cell_type_index, cell_type in enumerate(cell_types_and_all):

        cell_type_AM_means = []
        cell_type_PM_means = []

        cell_type_label = "All Cells" if cell_type is None else cell_type
        cell_type_label_list.append(cell_type_label)

        # Only filled in when subjects are tested separately
        num_samples_per_subject = []
        p_values_per_subject = []

        for subject_id in config.SUBJECT_IDS:

            subject_AM_means = []
            subject_PM_means = []

            for sample in config.SAMPLE_NAMES:

                sample_key = (cell_type, subject_id, sample)

                # Combinations without transcript counts were never cached
                if sample_key not in cell_type_subject_sample_transcript_count_means:
                    continue

                gene_mean = \
                    cell_type_subject_sample_transcript_count_means[sample_key][gene_index]

                # Any sample whose name does not contain "AM" counts as PM
                if "AM" in sample:
                    subject_AM_means.append(gene_mean)
                    AM_colors.append(config.SUBJECT_ID_COLORS[subject_id])
                    AM_mean_groups.append(cell_type_index)
                else:
                    subject_PM_means.append(gene_mean)
                    PM_colors.append(config.SUBJECT_ID_COLORS[subject_id])
                    PM_mean_groups.append(cell_type_index)

            subject_AM_means = numpy.array(subject_AM_means)
            subject_PM_means = numpy.array(subject_PM_means)

            num_samples = len(subject_AM_means) + len(subject_PM_means)

            if NORMALIZE_WITHIN_SUBJECT:
                # NOTE(review): each group mean is weighted by the size of the
                # *other* group, which differs from the plain mean over all of
                # the subject's samples whenever the AM and PM counts differ -
                # confirm this is intended.
                # NOTE(review): if the subject has no AM or no PM samples the
                # baseline is NaN and the remaining values become NaN too.
                mean_of_means = (
                    subject_AM_means.mean() * len(subject_PM_means)
                    + subject_PM_means.mean() * len(subject_AM_means)
                ) / num_samples
                subject_AM_means -= mean_of_means
                subject_PM_means -= mean_of_means

            cell_type_AM_means.extend(subject_AM_means)
            cell_type_PM_means.extend(subject_PM_means)

            if not POOL_SUBJECTS:
                t_statistic, subject_p_value = stats.ttest_ind(
                    subject_AM_means, subject_PM_means
                )
                # A NaN statistic means the test could not be computed for
                # this subject; leave the subject out of the combination
                if numpy.isnan(t_statistic):
                    continue
                num_samples_per_subject.append(num_samples)
                p_values_per_subject.append(subject_p_value)

        if POOL_SUBJECTS:
            _, p_value = stats.ttest_ind(cell_type_AM_means, cell_type_PM_means)
        elif p_values_per_subject:
            # Stouffer's method, each subject weighted by its sample count.
            # The per-subject tests use ttest_ind's default (two-sided)
            # alternative, so the direction of each subject's effect is not
            # taken into account here.
            _, p_value = stats.combine_pvalues(
                p_values_per_subject,
                weights=num_samples_per_subject,
                method="stouffer"
            )
        else:
            # No subject produced a usable test: report NaN explicitly
            # instead of handing scipy an empty list
            p_value = numpy.nan

        p_values.append(p_value)
        AM_means.extend(cell_type_AM_means)
        PM_means.extend(cell_type_PM_means)

    AM_mean_groups = numpy.array(AM_mean_groups)
    PM_mean_groups = numpy.array(PM_mean_groups)

    # AM points sit AM_GROUP_OFFSET to the left of the integer cell type
    # position, PM points sit on it. The shift is computed in double precision
    # before the float32 cast.
    AM_mean_groups_jittered = (AM_mean_groups - AM_GROUP_OFFSET).astype(numpy.float32)
    PM_mean_groups_jittered = PM_mean_groups.astype(numpy.float32)

    # Small seeded jitter so overlapping points remain distinguishable
    AM_mean_groups_jittered += jitter_rng.rand(len(AM_mean_groups_jittered)) / JITTER_DIVISOR
    PM_mean_groups_jittered += jitter_rng.rand(len(PM_mean_groups_jittered)) / JITTER_DIVISOR

    data = {
        "AM_colors": AM_colors,
        "PM_colors": PM_colors,
        "AM_means": AM_means,
        "PM_means": PM_means,
        "p_values": p_values,
        "AM_mean_groups": AM_mean_groups,
        "PM_mean_groups": PM_mean_groups,
        "cell_type_label_list": cell_type_label_list,
        "AM_mean_groups_jittered": AM_mean_groups_jittered,
        "PM_mean_groups_jittered": PM_mean_groups_jittered
    }

    gene_data[gene] = data

In [None]:
# Write the per-gene results into the data directory. The file name records
# whether the means were normalized within each subject; it does not record
# the POOL_SUBJECTS setting.
suffix = "_subject_normalized" if NORMALIZE_WITHIN_SUBJECT else ""
file_name = "gene_AM_PM_means" + suffix + ".pickle"

with open(os.path.join("data", file_name), "wb") as pickle_file:
    pickle.dump(gene_data, pickle_file)