In [None]:
import os
import pandas
from capblood_seq import config

from sparsedat import Sparse_Data_Table as SDT
from scrapi.dataset import Gene_Expression_Dataset as GED

In [None]:
# Name of the pipeline stage whose datasets receive the subject labels. The
# debris_filtered data is used so that only cells that survive the debris
# filtering get a subject label.
PIPELINE_NAME = "debris_filtered"

# Subject labels, addressed by position: the integer parsed from the
# freemuxlet BEST.GUESS field is used as an index into this list, so the
# order must not be changed.
# NOTE(review): how each index was matched to a subject is not shown in this
# notebook - confirm before editing.
SUBJECT_LABELS = ["S4", "S2", "S1", "S3"]

In [None]:
# For every sample, one (initially empty) set of cell barcodes per subject.
# NOTE(review): the inner keys come from config.SUBJECT_IDS while the lookup
# below uses SUBJECT_LABELS - a label missing from config.SUBJECT_IDS would
# raise KeyError; confirm the two stay in sync.
sample_subject_cell_barcodes = {
    sample_name: {subject_id: set() for subject_id in config.SUBJECT_IDS}
    for sample_name in config.SAMPLE_NAMES
}

# sep=r"\s+" is the supported equivalent of delim_whitespace=True, which is
# deprecated as of pandas 2.2.
demux_results = pandas.read_csv(
    os.path.join("data", "freemuxlet.clust1.samples"),
    sep=r"\s+",
    index_col=0,
    header=0,
)

# Only droplets called as singlets ("SNG") are considered for labeling.
singlets = demux_results[demux_results["DROPLET.TYPE"] == "SNG"]

for barcode, best_guess in zip(singlets["BARCODE"], singlets["BEST.GUESS"]):

    # BARCODE is "<sequence>-<suffix>"; the stored barcode is rewritten to
    # "<sequence>-1" and the suffix selects the sample.
    # NOTE(review): the suffix is used directly as a 0-based index into
    # config.SAMPLE_NAMES - confirm the barcodes are not 1-based.
    barcode_elements = barcode.split("-")
    cell_barcode = barcode_elements[0] + "-1"
    sample_index = int(barcode_elements[1])

    # BEST.GUESS holds two comma-separated indices; the cell is kept only
    # when both agree.
    guess_fields = best_guess.split(",")
    subject_index_1 = int(guess_fields[0])
    subject_index_2 = int(guess_fields[1])

    if subject_index_1 == subject_index_2:
        sample_name = config.SAMPLE_NAMES[sample_index]
        subject_label = SUBJECT_LABELS[subject_index_1]
        sample_subject_cell_barcodes[sample_name][subject_label].add(cell_barcode)

In [None]:
# Write the subject assignments into each sample's dataset as cell labels.
for sample in config.SAMPLE_NAMES:

    dataset = GED(os.path.join("data", sample), name=PIPELINE_NAME)
    barcodes_by_subject = sample_subject_cell_barcodes[sample]

    for subject_label in barcodes_by_subject:

        # Remove a pre-existing label of the same name before labeling again
        # (presumably so a re-run replaces it - confirm against scrapi).
        already_labeled = subject_label in dataset.get_labels()
        if already_labeled:
            dataset.delete_label(subject_label)

        dataset.label_cells(subject_label, barcodes_by_subject[subject_label])

    if sample == "PM3":
        # Subjects 1 and 4 were not sampled at this last time point, so the
        # labels just created for them are removed again.
        for unsampled_subject in ("S1", "S4"):
            dataset.delete_label(unsampled_subject)

    dataset.save_labels()