In [None]:
import os
import pickle

import numpy
from scipy import stats

import capblood_seq
from capblood_seq import config

In [None]:
# Whether to normalize means within each subject. When True, the per-sample
# means of every (gene, cell type, subject) trace are centered by subtracting
# the average of that trace's AM mean and PM mean, and "_subject_normalized"
# is appended to the name of the pickle file written at the end.
NORMALIZE_WITHIN_SUBJECT = True

In [None]:
# Load the dataset. This downloads it if it doesn't exist already, and loads it into memory
dataset = capblood_seq.load_dataset(data_directory="data", pipeline_name="normalized")
# NOTE(review): presumably this drops cells that carry more than one of the
# config.CELL_TYPES labels, so each remaining cell maps to at most one cell
# type - confirm against the capblood_seq documentation.
dataset.filter_multi_labeled_cells(config.CELL_TYPES)

In [None]:
# Column-wise mean and standard error of the transcript counts, keyed by
# (cell_type, subject_id, sample). The extra None entry is handed to
# get_transcript_counts as the cell type unchanged (presumably meaning
# "no cell type restriction" - TODO confirm).
cell_type_subject_sample_transcript_count_means = {}
cell_type_subject_sample_transcript_count_errors = {}

for cell_type in config.CELL_TYPES + [None]:
    for subject_id in config.SUBJECT_IDS:
        for sample in config.SAMPLE_NAMES:

            sample_counts = dataset.get_transcript_counts(
                sample,
                cell_type=cell_type,
                subject_id=subject_id,
                normalized=True
            )

            # No counts were returned for this combination, so it gets no
            # entry in either dictionary.
            if sample_counts is None:
                continue

            count_matrix = sample_counts.to_array()
            key = (cell_type, subject_id, sample)

            # Reduce along axis 0 (assumed to be the cell axis - TODO confirm),
            # leaving one value per column; the columns are later indexed by
            # position in dataset.gene_list.
            cell_type_subject_sample_transcript_count_means[key] = count_matrix.mean(axis=0)
            cell_type_subject_sample_transcript_count_errors[key] = stats.sem(count_matrix, axis=0)

In [None]:
def _center_on_time_of_day_mean(sample_names, means):
    """Center per-sample means on the balanced AM/PM average.

    The offset is the mean of the AM-group mean and the PM-group mean, so a
    trace with more samples at one time of day is not pulled towards it.

    Parameters
    ----------
    sample_names : list of str
        Sample names, parallel to ``means``. A name containing "AM" counts
        towards the AM group and one containing "PM" towards the PM group.
    means : list
        One mean value per sample.

    Returns
    -------
    list
        ``means`` with the offset subtracted, in the original order. If no
        sample belongs to either group, an unchanged copy is returned.
    """
    AM_means = numpy.array(
        [mean for name, mean in zip(sample_names, means) if "AM" in name]
    )
    PM_means = numpy.array(
        [mean for name, mean in zip(sample_names, means) if "PM" in name]
    )

    # Only average the groups that actually have samples. Taking the mean of
    # an empty group yields NaN, which would turn the whole trace into NaN
    # whenever a subject is missing all AM or all PM samples.
    group_means = [group.mean() for group in (AM_means, PM_means) if group.size > 0]
    if not group_means:
        return list(means)

    mean_of_means = numpy.mean(group_means)

    # Shift every value by the same offset, in place and in order. This keeps
    # the array dtype and avoids having to map values back to samples by
    # time-of-day tag.
    centered = numpy.array(means)
    centered -= mean_of_means
    return list(centered)


# gene -> cell type -> subject id -> {"x_values", "y_values", "errors"}, where
# x_values are the sample names that have data, y_values the matching means
# (centered when NORMALIZE_WITHIN_SUBJECT is set) and errors the matching
# standard errors.
gene_cell_type_subject_data = {}

for gene_index, gene in enumerate(dataset.gene_list):

    gene_cell_type_subject_data[gene] = {}

    for cell_type in config.CELL_TYPES + [None]:

        gene_cell_type_subject_data[gene][cell_type] = {}

        for subject_id in config.SUBJECT_IDS:

            x_values = []
            y_values = []
            errors = []

            for sample in config.SAMPLE_NAMES:

                cell_type_subject_sample = (cell_type, subject_id, sample)

                # Samples without counts were never added to the lookup
                # tables, so they are simply absent from this trace.
                if cell_type_subject_sample not in cell_type_subject_sample_transcript_count_means:
                    continue

                y_values.append(cell_type_subject_sample_transcript_count_means[cell_type_subject_sample][gene_index])
                errors.append(cell_type_subject_sample_transcript_count_errors[cell_type_subject_sample][gene_index])
                x_values.append(sample)

            if NORMALIZE_WITHIN_SUBJECT:
                # Errors are left as-is: subtracting a constant offset does
                # not change them.
                y_values = _center_on_time_of_day_mean(x_values, y_values)

            gene_cell_type_subject_data[gene][cell_type][subject_id] = {
                "x_values": x_values,
                "y_values": y_values,
                "errors": errors,
            }

In [None]:
# Build the output file name; the suffix records whether the traces were
# centered within each subject.
normalization_suffix = "_subject_normalized" if NORMALIZE_WITHIN_SUBJECT else ""
file_name = "gene_mean_traces" + normalization_suffix + ".pickle"

# Serialize the nested gene -> cell type -> subject dictionary into the
# "data" directory.
output_path = os.path.join("data", file_name)
with open(output_path, "wb") as pickle_file:
    pickle.dump(gene_cell_type_subject_data, pickle_file)