In [None]:
import os

import pandas as pd
import numpy as np

import scanpy as sc
import quicat

#from venny4py.venny4py import *
from upsetplot import UpSet, from_memberships

import matplotlib.pyplot as plt
import seaborn as sb

In [None]:
# Output root for the generated figures. Set the QUICAT_REPORTS_DIR environment
# variable to run the notebook on another machine; when it is unset the original
# location is used. os.path.join(..., '') guarantees the trailing separator that
# the f-string paths further down rely on (they append 'figures/...' directly).
reports_dir = os.path.join(
    os.environ.get(
        'QUICAT_REPORTS_DIR',
        '/home/daniele/Code/github_synced/barcoding/quicat_paper_code/reports/',
    ),
    '',
)
# Resolution handed to plt.savefig when exporting figures.
dpi = 300

# YOGESH

In [None]:
# Root folder of the Yogesh DNA benchmark; the cells below read each tool's
# output from its own sub-folder (bartab/, quicat/, pycashier/).
path = '/mnt/storage/Daniele/quicat_benchmark/dna/yogesh/'

### bartab

In [None]:
# Read bartab's combined count table and transpose it so rows are sequencing
# runs and columns are barcodes.
bartab_per_run = (
    pd.read_csv(f'{path}bartab/counts/all_counts_combined.tsv', delimiter='\t')
    .set_index('Barcode')
    .T
)
# Strip the '_S<n>_L<n>_R1_<n>' suffix so runs of the same sample share a label.
samples_bartab = bartab_per_run.index.str.replace(r'_S\d+_L\d+_R1_\d+', '', regex=True)
# Sum the counts of all runs that belong to the same sample.
bartab = bartab_per_run.assign(sample=samples_bartab).groupby('sample').sum()

In [None]:
# Convert per-sample counts to percentages of that sample's total reads.
sample_raw_counts = bartab.sum(axis=1)
bartab_frequencies = bartab.div(sample_raw_counts, axis=0).mul(100)
# Keep barcodes whose frequency exceeds 0.001 % in at least one sample.
bartab_kept_columns = bartab_frequencies.columns[bartab_frequencies.gt(0.001).any(axis=0)]
barcodes_filtered_bartab = set(bartab_kept_columns)
barcodes_bartab_frequencies_filtered = bartab_frequencies.loc[:, list(bartab_kept_columns)]

### quicat

In [None]:
# Load quicat's barcode output; the returned object is used like an AnnData
# below (var_names, X, scanpy calls).
ad = quicat.read_dna(f'{path}quicat/barcodes_output.csv')
# Every variable name in the quicat object counts as a detected barcode;
# no additional frequency filter is applied in this notebook.
barcodes_filtered_quicat = set(ad.var_names)

### pycashier

In [None]:
# One pycashier output file per sequencing run. os.listdir() returns entries in
# arbitrary order, so sort them to make the notebook reproducible across runs
# and file systems.
samples = sorted(os.listdir(f'{path}pycashier/outs/'))

In [None]:
# Read each pycashier output table, keyed by its file name. The keys are
# shortened to sample names later, when the tables are merged, so no name is
# derived here (the original computed one and never used it).
dfs = {
    sample: pd.read_csv(f'{path}pycashier/outs/{sample}', delimiter='\t').set_index('barcode')
    for sample in samples
}

In [None]:
# Put the per-file tables side by side (missing barcodes -> 0) and transpose so
# rows are files and columns are barcodes.
pycashier = pd.concat(dfs.values(), axis=1).fillna(0).T
# Drop everything from '.q20' onwards. partition() keeps the full name when the
# marker is missing, whereas slicing with str.find() would silently cut the last
# character (find() returns -1 in that case).
pycashier.index = [sample.partition('.q20')[0] for sample in dfs]
# Strip the '_S<n>_L<n>_R1_<n>' suffix so runs of the same sample share a label.
samples_pycashier = pycashier.index.str.replace(r'_S\d+_L\d+_R1_\d+', '', regex=True)
pycashier['sample'] = samples_pycashier
# Sum the counts of all runs that belong to the same sample.
pycashier = pycashier.groupby('sample').sum()

In [None]:
# Convert per-sample counts to percentages of that sample's total reads.
sample_raw_counts = pycashier.sum(axis=1)
pycashier_frequencies = pycashier.div(sample_raw_counts, axis=0).mul(100)
# Keep barcodes whose frequency exceeds 0.001 % in at least one sample.
pycashier_kept_columns = pycashier_frequencies.columns[pycashier_frequencies.gt(0.001).any(axis=0)]
barcodes_filtered_pycashier = set(pycashier_kept_columns)
barcodes_pycashier_frequencies_filtered = pycashier_frequencies.loc[:, list(pycashier_kept_columns)]

## plot venn

In [None]:
# Hex colour palette handed to venny4py for the Venn diagrams.
colors = ['#332288', '#DDCC77', '#CC6677', '#44AA99']

In [None]:
# The star-import of venny4py in the imports cell is commented out, so on a
# fresh kernel the call below raised NameError. Import the one function needed
# explicitly.
from venny4py.venny4py import venny4py

# Barcode sets reported by each tool on the Yogesh dataset.
sets = {
    'Pycashier': barcodes_filtered_pycashier,
    'Quicat': barcodes_filtered_quicat,
    'Bartab': barcodes_filtered_bartab,
}

# Draw the three-way Venn diagram and write it under the fig2 folder.
venny4py(
    sets,
    dpi=300,
    out=f'{reports_dir}figures/fig2/',
    ext='dna_yogesh.pdf',
    colors=colors,
    line_width=.01,
    size=5,
)

## Replicates plots quicat

In [None]:
# Run scanpy's PCA with 80 components on the quicat object.
# NOTE(review): the dendrogram cells below pass use_rep='X', so they do not read
# this PCA — confirm whether it is still needed.
sc.pp.pca(ad, n_comps = 80)

In [None]:
# Densify the count matrix for the correlation cells below. toarray() returns a
# plain ndarray (todense() returns the deprecated np.matrix), and the guard makes
# the cell safe to re-run once X is already dense.
if hasattr(ad.X, 'toarray'):
    ad.X = ad.X.toarray()

In [None]:
# Spearman correlation between 'sample' groups, using ad.X as representation.
sc.tl.dendrogram(ad, groupby='sample', use_rep='X', cor_method='spearman')

# Heat-map options: colour scale starts at 0 and no dendrogram is drawn.
sample_corr_options = dict(cmap='coolwarm', show=False, vmin=0, dendrogram=False)
sc.pl.correlation_matrix(ad, groupby='sample', **sample_corr_options)

plt.title('Sample Correlation Matrix', fontsize=14)
plt.tight_layout()

sample_corr_pdf = f'{reports_dir}figures/fig2/sample_correlation_matrix.pdf'
plt.savefig(sample_corr_pdf, dpi=dpi, bbox_inches='tight')


In [None]:
# Spearman correlation between 'replicate' groups, using ad.X as representation.
sc.tl.dendrogram(ad, groupby='replicate', use_rep='X', cor_method='spearman')

# Heat-map options: thin cell borders; vmin is left at scanpy's default here.
replicate_corr_options = dict(cmap='coolwarm', show=False, linewidth=.1)
sc.pl.correlation_matrix(ad, groupby='replicate', **replicate_corr_options)

plt.title('Replicates Correlation Matrix', fontsize=14)
plt.tight_layout()

replicate_corr_pdf = f'{reports_dir}figures/fig2/replicates_correlation_matrix.pdf'
plt.savefig(replicate_corr_pdf, dpi=dpi, bbox_inches='tight')


### Missing overlaps

In [None]:
# Barcodes reported by pycashier only (absent from both bartab and quicat).
pycashier_unique = barcodes_filtered_pycashier.difference(
    barcodes_filtered_bartab.union(barcodes_filtered_quicat)
)

In [None]:
# Sequence length of every pycashier-only barcode, then a tally per length.
lens_pycashier = list(map(len, pycashier_unique))
uniques, counts = np.unique(lens_pycashier, return_counts=True)

In [None]:
# Tally table: one row per distinct value (index) with its number of occurrences.
df = pd.DataFrame(
    {"Counts": counts},
    index=pd.Index(uniques, name="Unique Values"),
)

In [None]:
# Density of barcode lengths among the pycashier-only barcodes.
plt.figure(figsize=(10, 6))
# Fit the KDE on the distinct lengths weighted by how many barcodes have each
# length. Passing only df.index (as before) gave every distinct length the same
# weight, so the curve ignored the tallied counts.
kde_plot = sb.kdeplot(
    x=df.index.to_numpy(),
    weights=df["Counts"].to_numpy(),
    bw_adjust=0.5,
    label="Kernel Density of Counts",
    color='skyblue',
)
# Reuse the drawn curve's coordinates to shade the area underneath it.
x_vals, y_vals = kde_plot.get_lines()[0].get_data()
plt.fill_between(x_vals, 0, y_vals, color="orange", alpha=0.2)
plt.xticks(ticks=range(90, 110, 5), fontsize=10)
plt.xlabel("Barcodes length (bp)")
plt.ylabel("Density")
plt.title("Density estimation of barcodes' length")
plt.savefig(f'{reports_dir}figures/fig2/density_plot_pycashier_barcodes.pdf', dpi=dpi, bbox_inches='tight')


In [None]:
# Barcodes that bartab and pycashier agree on but quicat does not report.
missed_quicat = barcodes_filtered_bartab.intersection(
    barcodes_filtered_pycashier
).difference(barcodes_filtered_quicat)

In [None]:
# For each barcode missed by quicat: take its highest per-sample frequency in
# bartab and in pycashier, and average the two maxima.
missed_columns = barcodes_bartab_frequencies_filtered.loc[:, list(missed_quicat)].columns
max_values = [
    np.mean([
        barcodes_bartab_frequencies_filtered[barcode].max(),
        barcodes_pycashier_frequencies_filtered[barcode].max(),
    ])
    for barcode in missed_columns
]

In [None]:
# Distinct averaged-maximum frequencies (sorted) and how often each one occurs.
uniques, counts = np.unique(max_values, return_counts=True)

In [None]:
# Tally table: one row per distinct value (index) with its number of occurrences.
df = pd.DataFrame(
    {"Counts": counts},
    index=pd.Index(uniques, name="Unique Values"),
)

In [None]:
# Density of the frequencies of barcodes that quicat missed.
plt.figure(figsize=(10, 6))
# Weight each distinct frequency by its number of occurrences so repeated values
# contribute proportionally; passing only df.index ignored the tallied counts.
kde_plot = sb.kdeplot(
    x=df.index.to_numpy(),
    weights=df["Counts"].to_numpy(),
    bw_adjust=.25,
    label="Kernel Density of Counts",
    color='skyblue',
)
# Reuse the drawn curve's coordinates to shade the area underneath it.
x_vals, y_vals = kde_plot.get_lines()[0].get_data()
plt.fill_between(x_vals, 0, y_vals, color="orange", alpha=0.2)
plt.xticks(ticks=np.arange(0, 0.2, 0.001), fontsize=10)
# Only the 0–0.01 % range is shown.
plt.xlim(0, 0.01)

plt.xlabel("Barcodes frequencies (%)")
plt.ylabel("Density")
plt.title("Density estimation of barcodes' frequencies", fontsize=10)
plt.savefig(f'{reports_dir}figures/fig2/density_plot_quicat_missed_barcodes.pdf', dpi=dpi, bbox_inches='tight')


In [None]:
# Barcodes that quicat and pycashier agree on but bartab does not report.
missed_bartab = barcodes_filtered_quicat.intersection(
    barcodes_filtered_pycashier
).difference(barcodes_filtered_bartab)

In [None]:
# Number of barcodes shared by quicat and pycashier that bartab's filtered set lacks.
len(missed_bartab)

# Synthetic

In [None]:
# Root folder of the synthetic DNA benchmark; from here on `path` points at this
# dataset instead of the Yogesh one.
path = '/mnt/storage/Daniele/quicat_benchmark/dna/synthetic/'

In [None]:
# Folder holding the ground-truth barcode tables of the four synthetic samples.
ground_truth_path = '/mnt/storage/Daniele/clonal_toolkit_data/dna/synthetic/'
# synth_1 ... synth_4, in numeric order.
file_paths = [
    f'{ground_truth_path}synth_{sample_number}_ground_truth.csv'
    for sample_number in range(1, 5)
]

In [None]:
# Union of the 'barcode' column over all ground-truth files.
# (The original also created an unused `dfs` list and an unused enumerate index.)
barcodes_ground_truth = set()
for file_path in file_paths:
    ground_truth = pd.read_csv(file_path)
    barcodes_ground_truth.update(ground_truth['barcode'].tolist())

In [None]:
# Number of distinct ground-truth barcodes across the loaded files.
len(barcodes_ground_truth)

### bartab

In [None]:

# Read bartab's combined count table for the synthetic data and transpose it so
# rows are sequencing runs and columns are barcodes.
bartab_per_run = (
    pd.read_csv(f'{path}bartab/counts/all_counts_combined.tsv', delimiter='\t')
    .set_index('Barcode')
    .T
)
# Strip the '_S<n>_L<n>_R1_<n>' suffix so runs of the same sample share a label.
samples_bartab = bartab_per_run.index.str.replace(r'_S\d+_L\d+_R1_\d+', '', regex=True)
# Sum the counts of all runs that belong to the same sample.
bartab = bartab_per_run.assign(sample=samples_bartab).groupby('sample').sum()

In [None]:
# Convert per-sample counts to percentages of that sample's total reads.
sample_raw_counts = bartab.sum(axis=1)
bartab_frequencies = bartab.div(sample_raw_counts, axis=0)*100
# Keep barcodes above 0.001 % in at least one sample. The comparison is strict
# ('>') so bartab is filtered with the same cut-off as pycashier in this
# benchmark and as both tools in the Yogesh section; this cell previously used
# '>=', which admits barcodes sitting exactly on the threshold.
barcodes_filtered_bartab = set(bartab_frequencies.columns[(bartab_frequencies > 0.001).any(axis=0)])

### quicat

In [None]:
# Load quicat's barcode output for the synthetic data (replaces the Yogesh `ad`).
ad = quicat.read_dna(f'{path}quicat/barcodes_output.csv')

In [None]:
# Every variable name in the quicat object counts as a detected barcode;
# no additional frequency filter is applied in this notebook.
barcodes_filtered_quicat = set(ad.var_names)

### pycashier

In [None]:
# One pycashier output file per synthetic sample. os.listdir() returns entries
# in arbitrary order, so sort them to make the notebook reproducible across runs
# and file systems.
samples = sorted(os.listdir(f'{path}pycashier/outs/'))

In [None]:
# Read each pycashier output table, keyed by the file name up to '_R1'.
dfs = {}
for sample in samples:
    # partition() keeps the full name when '_R1' is missing, whereas slicing with
    # str.find() would silently cut the last character (find() returns -1).
    name = sample.partition('_R1')[0]
    # NOTE(review): two files that shorten to the same name overwrite each other
    # here — confirm that there is exactly one file per sample.
    dfs[name] = pd.read_csv(f'{path}pycashier/outs/{sample}', delimiter='\t').set_index('barcode')

In [None]:
# Put the per-sample tables side by side (missing barcodes -> 0), transpose so
# rows are samples, label each row with its dict key and sum rows per label.
pycashier = (
    pd.concat(dfs.values(), axis=1)
    .fillna(0)
    .T
    .assign(sample=list(dfs))
    .groupby('sample')
    .sum()
)

In [None]:
# Convert per-sample counts to percentages of that sample's total reads.
sample_raw_counts = pycashier.sum(axis=1)
pycashier_frequencies = pycashier.div(sample_raw_counts, axis=0).mul(100)
# Keep barcodes whose frequency exceeds 0.001 % in at least one sample.
pycashier_above_cutoff = pycashier_frequencies.gt(0.001).any(axis=0)
barcodes_filtered_pycashier = set(pycashier.columns[pycashier_above_cutoff])

## upset plot

In [None]:
# Barcode sets to compare: the ground truth and the three tools.
barcode_sets = {
    'Ground Truth': barcodes_ground_truth,
    'Quicat': barcodes_filtered_quicat,
    'Bartab': barcodes_filtered_bartab,
    'Pycashier': barcodes_filtered_pycashier
}

# For every barcode seen anywhere, list the names of the sets that contain it.
all_barcodes = set.union(*barcode_sets.values())
memberships = []
for barcode in all_barcodes:
    memberships.append(
        [label for label, members in barcode_sets.items() if barcode in members]
    )
data = from_memberships(memberships)

# UpSet appearance options, gathered in one place.
upset_options = dict(
    subset_size='sum',
    show_counts=True,
    element_size=None,
    intersection_plot_elements=6,
    show_percentages=False,
    other_dots_color=.1,
    shading_color=0.01,
)
upset = UpSet(data, **upset_options)

fig = plt.figure(figsize=(10, 6))
# Outline in red the subsets that include both quicat and the ground truth.
upset.style_subsets(present=["Quicat", "Ground Truth"], edgecolor="red", linewidth=1.5)
upset.plot(fig=fig)

upset_pdf = f'{reports_dir}figures/fig2/upset_plot_synth.pdf'
plt.savefig(upset_pdf, dpi=dpi, bbox_inches='tight')


## plot venn

In [None]:
# The star-import of venny4py in the imports cell is commented out, so on a
# fresh kernel the call below raised NameError. Import the one function needed
# explicitly.
from venny4py.venny4py import venny4py

# Barcode sets reported by each tool on the synthetic dataset, plus the ground truth.
sets = {
    'Pycashier': barcodes_filtered_pycashier,
    'Quicat': barcodes_filtered_quicat,
    'Bartab': barcodes_filtered_bartab,
    'Ground Truth': barcodes_ground_truth,
}

# Draw the four-way Venn diagram and write it under the fig2 folder.
venny4py(
    sets,
    dpi=300,
    out=f'{reports_dir}figures/fig2/',
    ext='dna_synth.pdf',
    colors=colors,
    line_width=.75,
    legend_cols=4,
    size=9,
)