In [None]:
import gc
import glob
import os
import subprocess
import sys

import anndata as ad
import dask.dataframe as dd
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.io as sio
import scipy.sparse
import scvelo as scv
import seaborn as sns
from dask.diagnostics import ProgressBar
# Register a dask progress bar for the session so dask computations report their progress.
ProgressBar().register()

In [None]:
# Show package/version information for this session (useful for reproducing the environment).
import session_info
session_info.show()

In [None]:
# --- Configuration -----------------------------------------------------------
working_dir = ""             # output location handed to generate_splice_matrices
tscp_paths = glob.glob("")   # glob pattern for the (gzipped) transcript-assignment files
subs = [*range(1,3)]         # sub-library keys used when the per-sample objects are concatenated


def unzipped_path(path):
    """Return ``path`` without a trailing ``.gz`` suffix (unchanged if it has none).

    Only the suffix is removed; a ``.gz`` occurring elsewhere in the path
    (e.g. in a directory name) is left alone.
    """
    return path[:-len(".gz")] if path.endswith(".gz") else path


# Unzip each tscp file, if not already done, and collect the unzipped paths.
tscp_unzipped = []
for path in tscp_paths:
    target = unzipped_path(path)
    if not os.path.exists(target):
        # Argument list instead of a shell string: paths with spaces or shell
        # metacharacters are passed through verbatim. check=True makes a failed
        # decompression raise here instead of surfacing later as a missing file.
        subprocess.run(["sudo", "pigz", "-k", "-d", path], check=True)
    tscp_unzipped.append(target)

print(tscp_unzipped)

In [None]:
def generate_splice_matrices(tscp_path, cutoff, adata_path):
    """Build an AnnData with spliced/unspliced count layers from a transcript-assignment CSV.

    Rows of the table are counted per (barcode, gene) pair. Rows flagged
    ``exonic`` go into the ``spliced`` layer, all remaining rows into the
    ``unspliced`` layer, and ``X`` is the sum of both.

    Parameters
    ----------
    tscp_path : str
        Path of the unzipped CSV. It must provide the columns ``bc_wells``,
        ``gene_name`` and ``exonic`` (``exonic`` is assumed to be boolean,
        since it is used directly as a query mask).
    cutoff : int or float
        Minimum number of rows a barcode needs in order to be kept as a cell.
    adata_path : str
        Path of the ``.h5ad`` file the result is written to.

    Returns
    -------
    AnnData
        Cells x genes object with ``layers["spliced"]`` and
        ``layers["unspliced"]``; obs is indexed by barcode, var by gene name.

    Raises
    ------
    ValueError
        If no barcode reaches ``cutoff``.
    """
    print(f"Reading in {tscp_path}")
    # dask reads the CSV in 800MB blocks; compute() materialises it as one pandas DataFrame.
    tscp_assign_df = dd.read_csv(tscp_path, blocksize="800MB").compute()

    # Rows per barcode; only barcodes at or above the cutoff are kept as cells.
    cell_tscp_cnts = tscp_assign_df.groupby("bc_wells").size()
    cell_tscp_cnts = cell_tscp_cnts[cell_tscp_cnts >= cutoff]

    # The gene axis is taken from the unfiltered table, the cell axis from the kept barcodes.
    genes = tscp_assign_df.gene_name.unique()
    bcs = cell_tscp_cnts.index
    if len(bcs) == 0:
        raise ValueError(f"No barcode in {tscp_path} has at least {cutoff} transcripts")
    gene_dict = dict(zip(genes, range(len(genes))))
    barcode_dict = dict(zip(bcs, range(len(bcs))))

    print("\nFiltering tscp file..")
    # Vectorised membership test (replaces a per-row dict lookup wrapped in a bare except).
    reads_to_keep = tscp_assign_df.bc_wells.isin(bcs)
    # Explicit copy so the column assignments below act on an independent frame
    # rather than on a slice of the full table.
    tscp_assign_df_filt = tscp_assign_df.loc[reads_to_keep].copy()
    del tscp_assign_df  # the unfiltered table is not needed any more
    tscp_assign_df_filt["cell_index"] = tscp_assign_df_filt.bc_wells.map(barcode_dict)
    tscp_assign_df_filt["gene_index"] = tscp_assign_df_filt.gene_name.map(gene_dict)
    print("Done:", tscp_assign_df_filt.shape)

    shape = (len(bcs), len(genes))

    def _count_matrix(reads):
        """Count rows per (cell_index, gene_index) pair into a CSR matrix of the full shape."""
        rcv = (
            reads.groupby(["cell_index", "gene_index"])
            .size()
            .reset_index()
            .to_numpy(dtype=np.int64)
        )
        # Passing shape= fixes the dimensions directly, so no dummy zero entry
        # at the last cell/gene is needed and an empty selection still works.
        return scipy.sparse.csr_matrix((rcv[:, 2], (rcv[:, 0], rcv[:, 1])), shape=shape)

    X_exonic = _count_matrix(tscp_assign_df_filt.query("exonic"))
    X_intronic = _count_matrix(tscp_assign_df_filt.query("~exonic"))

    X = X_exonic + X_intronic
    adata = scv.AnnData(X=X)

    adata.obs = pd.DataFrame({"barcodes": bcs}, index=bcs)
    adata.var = pd.DataFrame({"gene": genes, "gene_name": genes}, index=genes)

    adata.var_names_make_unique()
    adata.obs_names_make_unique()
    adata.layers["spliced"] = X_exonic
    adata.layers["unspliced"] = X_intronic
    scv.utils.show_proportions(adata)

    adata.obs.index = adata.obs.index.astype(str)
    adata.var.index = adata.var.index.astype(str)

    print(adata)

    # Write to the caller-supplied path (previously this argument was ignored
    # and the file name was a hard-coded empty string).
    adata.write_h5ad(adata_path)
    print(f"Saved AnnData to {adata_path}")

    return adata

In [None]:
# One transcript-count cutoff per input file, indexed like tscp_paths.
# The "XXXX" entries are placeholders; they are compared against per-barcode
# counts inside generate_splice_matrices.
tscp_cutoffs = ["XXXX","XXXX"]

# Build one spliced/unspliced AnnData per input file.
ad_list_sp = []
for sample_idx in range(len(tscp_paths)):
    sample_adata = generate_splice_matrices(
        tscp_unzipped[sample_idx],
        tscp_cutoffs[sample_idx],
        f"{working_dir}",
    )
    ad_list_sp.append(sample_adata)

In [None]:
# Combine the per-sample objects into a single AnnData. Each entry of `subs`
# labels one object, and "__s" joins the original barcode with that label so
# that observation names stay unique across samples.
ad_splice = ad.concat(
    ad_list_sp,
    keys=subs,
    index_unique="__s",
)

In [None]:
# Restrict the annotated object to cells whose obs["plaque"] is one of the listed plaques.
# NOTE(review): `adata` is not defined in any cell visible in this notebook. It has to be
# loaded before this cell, otherwise a fresh "Restart & Run All" stops here with a NameError.
wt = adata[adata.obs["plaque"].isin(["P1", "P2", "etc."])]
wt

In [None]:
# Keep only the cells whose barcode also occurs in `wt`.
# A vectorised membership test replaces the nested list scan, which rebuilt
# `wt.obs.index.to_list()` for every barcode (quadratic in the number of cells).
in_wt = ad_splice.obs.index.isin(wt.obs.index)
common_bcs = ad_splice.obs.index[in_wt].to_list()

# Materialise the subset so that later assignments to .obs act on a real
# object instead of forcing an implicit copy of a view.
ad_splice = ad_splice[in_wt].copy()

In [None]:
# Attach the `wt` annotations to each cell by barcode (index-on-index left join).
# Passing the caller's own index via `on=` is redundant: pandas treats it as an
# anonymous array key, which can add a stray `key_0` column to the result.
common = ad_splice.obs.join(wt.obs, how="left")

In [None]:
# Use the joined table (ad_splice.obs left-joined with wt.obs) as the cell annotations.
ad_splice.obs = common

In [None]:
# Clear the name of the observation index. AnnData has no `.index` attribute;
# the index of the cells lives on `.obs`.
ad_splice.obs.index.name = None

In [None]:
# Preprocess the data, compute RNA velocity, and plot

In [None]:
scv.settings.verbosity = 3 # show errors(0), warnings(1), info(2), hints(3)
# Preprocessing: every call below updates ad_splice in place, so the order matters.
# Gene filter with a minimum of 10 shared counts.
scv.pp.filter_genes(ad_splice, min_shared_counts=10)
# Per-cell normalisation.
scv.pp.normalize_per_cell(ad_splice)
# Dispersion-based gene selection, keeping 3000 genes.
scv.pp.filter_genes_dispersion(ad_splice, n_top_genes=3000)
# log1p transform.
scv.pp.log1p(ad_splice)
scv.pp.remove_duplicate_cells(ad_splice)
# Moments computed with 50 PCs and 30 neighbours.
scv.pp.moments(ad_splice, n_pcs=50, n_neighbors=30)

# UMAP embedding, then stochastic-mode velocities and the velocity graph.
sc.tl.umap(ad_splice)
scv.tl.velocity(ad_splice, mode="stochastic")
scv.tl.velocity_graph(ad_splice)

In [None]:
# Fixed colour for every fine-clustering label, so that a given cell type is
# drawn in the same colour in each UMAP / stream plot that is passed this palette.

fine_colors = {
    'CD8+ Tem': "#1f77b4",             # Blue
    'CD8+ Trm': "#ff7f0e",             # Orange
    'CD4+/CD8+ Trm/exh': "#2ca02c",    # Green
    'cDC1': "#d62728",                 # Red
    'Intermed. Monocytes': "#9467bd",  # Purple
    'Prolif.': "#8c564b",              # Brown
    'TREM2+ Macro.': "#e377c2",        # Pink
    'Naive B Cell': "#7f7f7f",         # Grey
    'CD4+ Tnaive': "#bcbd22",          # Yellow-Green
    'CD16- NK': "#17becf",             # Cyan
    'CD8+ Temra': "#393b79",           # Dark Blue
    'C1Q+ Macro.': "#ffbb78",          # Light Orange
    'Plasmablasts': "#98df8a",         # Light Green
    'cDC2': "#ff9896",                 # Light Red
    'SMCs': "#c5b0d5",                 # Lavender
    'Classical Monocytes': "#c49c94",  # Light Brown
    'CD8+ Tnaive': "#ffdfea",          # Light Pink
    'CD16+ NK': "#c7c7c7",             # Light Grey
    'CD4+ Treg': "#000000",            # Black
    'CD4+ Teff': "#9edae5",            # Light Cyan
    'Mast Cells': "#2760d6",           # Royal Blue
    'Neutrophils': "#e7ba52",          # Golden Yellow
    'Activated B Cell': "#31a354",     # Forest Green
    'pDCs': "#FF5733",                 # Orange-Red
    'Non. Switched Mem. B Cell': "#ffce1b",   # Mustard
    'Switched Mem. B Cell': "#FFC0CB",        # Pink
    'Trans. B Cell':"#012169",                  # Navy
    'Endothelial': "#e77b7b"           # Light Red
}

In [None]:
# Split the cells by the value of obs["type"] ("sorted" vs "unsorted").
sample_type = ad_splice.obs["type"]
ad_splice_sorted = ad_splice[sample_type.isin(["sorted"])]
ad_splice_unsorted = ad_splice[sample_type.isin(["unsorted"])]

In [None]:
# Set figure parameters
scv.settings.presenter_view = True # set max width size for presenter view
scv.set_figure_params("scvelo") # for beautified visualization
scv.set_figure_params(figsize=(6,4), dpi=150, format="png", dpi_save=300, transparent=False, facecolor="white", fontsize=8)

# Generic 28-colour palette (seaborn is imported with the other imports at the
# top of the notebook). The stream plot here takes its colours from
# `fine_colors`, not from this palette.
cluster_colors = sns.color_palette("hls", 28)

# Stream plot over all cells. It is saved under its own name: the original
# reused save="stream_embedding" for every stream plot, so each saved figure
# overwrote the previous one.
scv.pl.velocity_embedding_stream(ad_splice, basis="umap", color="fine_clustering", palette=fine_colors,
                                 size=10, alpha=0.8, fontsize=10, save="stream_embedding_all", legend_loc="right",
                                 title="RNA Velocity - All")

In [None]:
# Stream plot for the "sorted" subset only. It is saved under its own name so
# it does not overwrite the other stream plots (they all used
# save="stream_embedding"). The redundant re-creation of the unused
# `cluster_colors` palette is dropped.
scv.pl.velocity_embedding_stream(ad_splice_sorted, basis="umap", color="fine_clustering", palette=fine_colors,
                                 size=10, alpha=0.8, fontsize=10, save="stream_embedding_facs", legend_loc="right",
                                 title="RNA Velocity - FACS")

In [None]:
# Stream plot for the "unsorted" subset only. It is saved under its own name so
# it does not overwrite the other stream plots (they all used
# save="stream_embedding"). The redundant re-creation of the unused
# `cluster_colors` palette is dropped.
scv.pl.velocity_embedding_stream(ad_splice_unsorted, basis="umap", color="fine_clustering", palette=fine_colors,
                                 size=10, alpha=0.8, fontsize=10, save="stream_embedding_bead_enriched", legend_loc="right",
                                 title="RNA Velocity - Bead Enriched")

In [None]:
# confirm counts: print the summary of the full object and of both subsets,
# one after the other
print(ad_splice, ad_splice_sorted, ad_splice_unsorted, sep="\n")