In [None]:
import os

import pandas
import numpy

from plotly import graph_objects
from plotly import offline as plotly
from plotly.subplots import make_subplots

from capblood_seq import config

from scrapi.dataset import Gene_Expression_Dataset as GED

from pepars.plotting import plotting

In [None]:
# Per-sample summary of how the "debris_filtered" cell set differs from the
# "cellranger_filtered" cell set: barcodes removed, barcodes added, and the
# percentage of the Cellranger-called barcodes that were removed.
columns = ["Cellranger Called", "Removed", "Added", "Final # Cells", "% Removed"]
data_types = {
    columns[0]: int,
    columns[1]: int,
    columns[2]: int,
    columns[3]: int,
    columns[4]: float
}

# Collect one record per sample and build the frame once at the end, rather
# than growing an initially empty (object-dtype) DataFrame row by row.
records = []

for sample in config.SAMPLE_NAMES:

    dataset_path = os.path.join("data", sample)

    # Read the barcodes under the "debris_filtered" name, then load
    # "cellranger_filtered" into the same object and read them again.
    ged = GED(dataset_path, name="debris_filtered")
    debris_filtered_cell_barcodes = ged.get_cells()
    ged.load("cellranger_filtered")
    cellranger_filtered_cell_barcodes = ged.get_cells()

    removed_barcodes = cellranger_filtered_cell_barcodes.difference(debris_filtered_cell_barcodes)
    added_barcodes = debris_filtered_cell_barcodes.difference(cellranger_filtered_cell_barcodes)

    num_cellranger_called = len(cellranger_filtered_cell_barcodes)

    # A sample without any Cellranger-called cells has no meaningful
    # percentage; record NaN instead of raising ZeroDivisionError.
    if num_cellranger_called > 0:
        percent_removed = numpy.round(len(removed_barcodes)/num_cellranger_called * 100, 2)
    else:
        percent_removed = numpy.nan

    records.append({
        "Cellranger Called": num_cellranger_called,
        "Removed": len(removed_barcodes),
        "Added": len(added_barcodes),
        "Final # Cells": len(debris_filtered_cell_barcodes),
        "% Removed": percent_removed
    })

# Rows are indexed by sample name; counts are integers, the percentage a float
df = pandas.DataFrame(
    records,
    index=list(config.SAMPLE_NAMES),
    columns=columns
).astype(data_types)
df

In [None]:
# Persist the per-sample debris filtering summary next to the sample data
statistics_file_path = os.path.join("data", "debris_filtering_statistics.csv")
df.to_csv(statistics_file_path)

In [None]:
# One scatter panel per sample: total transcript count of each barcode (x)
# against the fraction of its transcripts that come from "mt-" genes (y).
# Barcodes removed by debris filtering are drawn in red, barcodes kept in the
# debris-filtered set in blue.
num_columns = 2

# Keep at least the original three rows, but grow the grid when there are
# more samples than a 3x2 layout can hold (add_trace would otherwise fail).
num_rows = max(3, (len(config.SAMPLE_NAMES) + num_columns - 1) // num_columns)

figure = make_subplots(
    rows=num_rows,
    cols=num_columns,
    subplot_titles=config.SAMPLE_NAMES
)

# Values pooled across all samples, in sample order
filtered_x = []
filtered_y = []

removed_x = []
removed_y = []

for sample_index, sample in enumerate(config.SAMPLE_NAMES):

    # Fill the grid left-to-right, top-to-bottom; plotly rows/cols are 1-based
    row_index, column_index = divmod(sample_index, num_columns)
    subplot_position = {"row": row_index + 1, "col": column_index + 1}

    # Only the first sample's traces get legend entries, so "Debris" and
    # "Debris Filtered" each appear once in the legend.
    show_legend = sample_index == 0

    dataset_path = os.path.join("data", sample)

    # Read the barcodes under the "debris_filtered" name, then load
    # "cellranger_filtered" into the same object and read them again.
    ged = GED(dataset_path, name="debris_filtered")
    debris_filtered_cell_barcodes = ged.get_cells()
    ged.load("cellranger_filtered")
    cellranger_filtered_cell_barcodes = ged.get_cells()

    removed_barcodes = cellranger_filtered_cell_barcodes.difference(debris_filtered_cell_barcodes)

    # Genes whose name starts with "mt-" (case-insensitive)
    mt_genes = [gene for gene in ged.get_genes() if gene.lower().startswith("mt-")]

    # NOTE(review): this reads a private attribute of the dataset object;
    # presumably a barcodes x genes count matrix — confirm against scrapi.
    union_barcodes = debris_filtered_cell_barcodes.union(cellranger_filtered_cell_barcodes)
    union_transcript_counts = ged._barcode_transcript_counts[list(union_barcodes)]

    removed_barcode_list = list(removed_barcodes)
    removed_total_transcript_counts = union_transcript_counts[removed_barcode_list].sum(axis=1)
    removed_mt_ratio = union_transcript_counts[removed_barcode_list, mt_genes].sum(axis=1)/removed_total_transcript_counts

    removed_x.extend(removed_total_transcript_counts)
    removed_y.extend(removed_mt_ratio)

    removed_scatter = graph_objects.Scatter(
        x=removed_total_transcript_counts,
        y=removed_mt_ratio,
        mode="markers",
        marker={
            "color": "red",
            "size": 1.5
        },
        name="Debris",
        showlegend=show_legend
    )

    figure.add_trace(removed_scatter, **subplot_position)

    filtered_barcode_list = list(debris_filtered_cell_barcodes)
    filtered_total_transcript_counts = union_transcript_counts[filtered_barcode_list].sum(axis=1)
    filtered_mt_ratio = union_transcript_counts[filtered_barcode_list, mt_genes].sum(axis=1)/filtered_total_transcript_counts

    filtered_x.extend(filtered_total_transcript_counts)
    filtered_y.extend(filtered_mt_ratio)

    filtered_scatter = graph_objects.Scatter(
        x=filtered_total_transcript_counts,
        y=filtered_mt_ratio,
        mode="markers",
        marker={
            "color": "blue",
            "size": 1.5
        },
        name="Debris Filtered",
        showlegend=show_legend
    )

    figure.add_trace(filtered_scatter, **subplot_position)

    # The ratio is a fraction, so pin the y axis to [0, 1] in every panel
    figure.update_yaxes(
        {
            "range": [0, 1],
            "title": "Mitochondrial Gene Ratio"
        },
        **subplot_position
    )

    figure.update_xaxes(
        {
            "title": "Number of Transcripts"
        },
        **subplot_position
    )

In [None]:
# Fully transparent plot and paper backgrounds, with a fixed figure height
layout = dict(
    plot_bgcolor="rgba(0, 0, 0, 0)",
    paper_bgcolor="rgba(0, 0, 0, 0)",
    height=1000
)

figure.update_layout(**layout)

# Render the figure inline in the notebook
plotly.iplot(figure)

In [None]:
# Export the figure as both an SVG image and an HTML file. Create the output
# directory first so the export does not fail when "figures" is missing.
figures_directory = "figures"
os.makedirs(figures_directory, exist_ok=True)

figure.write_image(os.path.join(figures_directory, "debris_filtering_mitochondrial_ratio.svg"))
figure.write_html(os.path.join(figures_directory, "debris_filtering_mitochondrial_ratio.html"))