In [None]:
import pandas as pd

import numpy as np
import scanpy as sc

import pathlib as pl

import seaborn as sns

from tqdm.notebook import tqdm

import os

# Download single-cell data

In [None]:
# Read the full-cohort single-cell object from disk.
# The path below is a placeholder and must be pointed at the real .h5ad file.
cohort_h5ad = "/add/path/here/full_cohort.h5ad"
adata = sc.read_h5ad(cohort_h5ad)

In [None]:
# One marker table per signature, keyed by the id used in the file name.
# Each CSV is read with its first column as the index; downstream cells rely
# on the row order of that index (they take the first `n_genes` entries).
signature_ids = ("1", "2", "3", "4", "5")
marker_genes = {
    sig_id: pd.read_csv(f"/add/path/here/cNMF_{sig_id}.csv", index_col=0)
    for sig_id in signature_ids
}

In [None]:
# Refined annotation labels: the first CSV column becomes the index and the
# remaining column is renamed to "refined_annotations" (this assignment raises
# if the file has more than one data column).
annotations_csv = "/add/path/here/refined_annotations.csv"
refined_annotations = pd.read_csv(annotations_csv, index_col=0)
refined_annotations.columns = ["refined_annotations"]

In [None]:
# Mapping from fine-grained refined annotation labels to coarse compartments.
# Labels that are not listed here are left unchanged by `.replace` below.
# NOTE(review): "Adipocytes" maps to "Stromal/Muscle", which is a different
# label from "Muscle" and is therefore not one of the compartments screened in
# the gene-selection cell -- confirm this is intended.
highlevel_refined = {
    "Hepatocyte": "Epithelial",
    "Carcinoma": "Carcinoma",
    "Fibroblast": "Fibroblast",
    "Quiescent endothelial cells": "Endothelial",
    "Smooth muscle": "Muscle",
    "Skeletal muscle": "Muscle",
    "TAM2": "Myeloid",
    "TAM3": "Myeloid",
    "TCD4": "Lymphoid",
    "Inflammatory CAF": "Fibroblast",
    "Adipose CAF": "Fibroblast",
    "HGF-CAF": "Fibroblast",
    "TAM1": "Myeloid",
    "Myeloid-HighMT": "Unknown/technical",
    "Angiogenic EC": "Endothelial",
    "Quiescent EC": "Endothelial",
    "Venous EC": "Endothelial",
    "TCD8": "Lymphoid",
    "B": "Lymphoid",
    "DC": "Myeloid",
    "Hepatic EC": "Endothelial",
    "Kupffer cells": "Myeloid",
    "NK": "Lymphoid",
    "Treg": "Lymphoid",
    "StrMus-HighMT": "Unknown/technical",
    "T-HighMT": "Unknown/technical",
    "Mast": "Myeloid",
    "Adipocytes": "Stromal/Muscle",
    "Endo-HighMT": "Unknown/technical",
}

# Attach the refined labels as a single column aligned to the cells in `adata`.
# Assigning one reindexed column (instead of replacing `adata.obs` with an
# outer-join concat) keeps the cell order and count untouched and makes this
# cell safe to re-run: a second run overwrites the column rather than creating
# a duplicate "refined_annotations" column. Cells without an entry in the CSV
# get NaN.
adata.obs["refined_annotations"] = (
    refined_annotations["refined_annotations"].reindex(adata.obs_names)
)
adata.obs["highlevel_refined"] = adata.obs["refined_annotations"].replace(highlevel_refined)

In [None]:
# Number of top-ranked marker genes (first rows of each marker table) to screen.
n_genes = 200
# A gene is retained only if it is detected (value > 0) in fewer than this
# fraction of cells in every compartment listed in `offtarget_compartments`.
max_offtarget_fraction = 0.1
offtarget_compartments = ["Endothelial", "Fibroblast", "Lymphoid", "Muscle", "Myeloid"]

# For each signature: table (genes x compartments) of detection fractions,
# restricted to the genes that pass the off-target filter.
all_selgenes = {}
for sig in tqdm(marker_genes):
    top_genes = marker_genes[sig].index[:n_genes]

    # Dense detection indicator (cells x genes); `.X` may be sparse or dense.
    expr = adata[:, top_genes].X
    expr = expr.toarray() if hasattr(expr, "toarray") else np.asarray(expr)
    detected = pd.DataFrame(
        (expr > 0).astype(np.uint8), index=adata.obs_names, columns=top_genes
    )

    # Mean of a 0/1 indicator per compartment == fraction of cells detecting
    # the gene. Unlike counting observed (state, compartment) pairs, this gives
    # 0.0 (not NaN) when no cell of a compartment detects the gene, and it does
    # not fail for genes detected in no cells or in all cells.
    # Cells with a missing compartment label are excluded by groupby.
    all_perc = (
        detected.groupby(adata.obs["highlevel_refined"], observed=True)
        .mean()
        .drop(index="Unknown/technical", errors="ignore")
        .T
        # Keep the gene index unnamed so downstream CSV headers are unchanged.
        .rename_axis(index=None)
    )

    keep = (all_perc[offtarget_compartments] < max_offtarget_fraction).all(axis=1)
    all_selgenes[sig] = all_perc[keep]

In [None]:
# Output directory for the filtered gene lists (placeholder path).
# Created together with any missing parents; an existing directory is fine.
resdir = pl.Path("/add/path/here/")
resdir.mkdir(parents=True, exist_ok=True)

In [None]:
# Write one CSV per signature holding the names of the retained genes
# (the index of each filtered table) into `resdir`.
for sig_id, kept_table in all_selgenes.items():
    out_csv = resdir / f"cNMF_{sig_id}.csv"
    pd.Series(kept_table.index).to_csv(out_csv)