In [None]:
import os

import pandas

from scrapi.dataset import Gene_Expression_Dataset as GED

from capblood_seq import config as config

In [None]:
# Name of the pipeline whose dataset the subject labels are added to. We use
# our debris_filtered data, so that only cells that survive the debris
# filtering get a subject label.
PIPELINE_NAME = "debris_filtered"

In [None]:
# For every sample: read the demuxlet calls, group the cell barcodes by the
# subject they were assigned to, and store those groups as labels on the
# sample's dataset.
for sample in config.SAMPLE_NAMES:

    # Load the output from demuxlet for this sample. The file is parsed as
    # whitespace-delimited with a header row, and the first column becomes the
    # index (used below as the cell barcode). sep=r"\s+" is the replacement
    # for the deprecated delim_whitespace=True and parses the same way.
    demuxlet_file_path = os.path.join("data", sample, "demuxed.best")
    demuxed_df = pandas.read_csv(demuxlet_file_path, sep=r"\s+", index_col=0, header=0)

    # For each subject, collect the barcodes of the cells whose "BEST" call is
    # exactly that subject's "SNG-<subject>_GENOTYPE" string. Cells with any
    # other call end up in no list. Selecting on the whole column at once
    # avoids a Python-level pass over every (cell, subject) pair and keeps the
    # barcodes in file order.
    best_calls = demuxed_df["BEST"]
    subject_cell_barcodes = {
        subject_id: best_calls[best_calls == "SNG-%s_GENOTYPE" % subject_id].index.tolist()
        for subject_id in config.SUBJECT_IDS
    }

    # Label these subjects' cells in the dataset
    dataset = GED(os.path.join("data", sample), name=PIPELINE_NAME)
    for subject_id in config.SUBJECT_IDS:
        dataset.label_cells(subject_id, subject_cell_barcodes[subject_id])

    if sample == "PM3":
        # Remove Subjects 1 and 4 from the last sample - they were not
        # sampled at this time point.
        dataset.delete_label("S1")
        dataset.delete_label("S4")

    dataset.save_labels()