In [1]:
import scanpy as sc
import scvelo as scv
import numpy as np
import pandas as pd
import anndata
import seaborn as sns
import matplotlib.pyplot as plt

import sys
sys.path.insert(0, "../../..")

from paths import DATA_DIR

In [2]:
# Display the data root imported from `paths` to confirm which directory is in use.
DATA_DIR

PosixPath('/nfs/homedirs/pala/scCFM/project_dir/data')

In [3]:
# Directory holding the raw host-factor Perturb-seq files used below.
folder = DATA_DIR.joinpath("hein_et_al", "rawdata", "perturb_host")

In [None]:
# Load the count matrix exactly as stored on disk (transposed in a later cell).
mtx_path = folder / "matrix.mtx"
adata_genesxcells = sc.read_mtx(mtx_path)

Transpose gene x cell to cell x gene 

In [None]:
# Show the AnnData summary of the matrix as loaded, before it is transposed.
adata_genesxcells

In [None]:
# Transpose the loaded matrix and wrap it in a fresh AnnData object.
# `.tocsr()` restores a row-major sparse layout after the transpose, which suits
# the row-wise (per-cell) subsetting done later in the notebook.
adata = anndata.AnnData(adata_genesxcells.X.T.tocsr())
# Show only the dimensions: densifying the whole matrix just to display it
# allocates a full dense copy and prints nothing readable.
adata.shape

In [None]:
# Show the AnnData summary of the transposed object.
adata

**Read the metadata** 

In [None]:
# Locations of the annotation files that sit next to the count matrix.
barcode_path, cellidentity_path, genes_path = (
    folder / file_name
    for file_name in ("barcodes.tsv", "cell_identities.csv", "genes.tsv")
)

In [None]:
# Both .tsv files are read as tab-separated tables without a header row,
# so their columns get positional integer labels (0, 1, ...).
headerless_tsv = dict(sep="\t", header=None)

barcodes = pd.read_csv(barcode_path, **headerless_tsv)
genes = pd.read_csv(genes_path, **headerless_tsv)
genes.columns = ["ENSMBL", "gene ID"]

# The cell-identity table is a regular CSV whose first row is the header.
cellidentity = pd.read_csv(cellidentity_path)

Load the comprehensive metadata from the supplementary material of the paper

In [None]:
# Per-cell metadata table shipped as a spreadsheet alongside the raw data.
metadata_path = folder / "CRISPRi_perturb_host_processed.xlsx"
metadata = pd.read_excel(metadata_path)
metadata.head()

**Information we have:** UMI_count, guide_UMI_count, time, guide identity, guide_target, guide_phenotype, interferon score 

Subset the cells based on the metadata file 

In [None]:
# Keep only the barcodes (column 0 of `barcodes`) that also appear in the metadata.
in_metadata = barcodes[0].isin(metadata["cell_barcode"].to_numpy())
cells_to_keep = barcodes.loc[in_metadata]
# Positional row indices of the retained barcodes, used to subset `adata`.
cells_to_keep_idx = cells_to_keep.index.to_numpy()

In [None]:
# Restrict `adata` to the cells present in the metadata.
# `.copy()` turns the sliced view into an independent object, so the later
# `.obs` / `.var` assignments modify real data instead of triggering an
# implicit view-to-copy conversion.
# NOTE: this cell rebinds `adata`; re-run it only after re-creating `adata`
# from the raw matrix, since the indices refer to the unsubset object.
adata = adata[cells_to_keep_idx, :].copy()

Experimental time 

In [None]:
# Convert time labels such as "<number>h" into floats by keeping the text before
# the first "h". Casting to `str` first makes the cell safe to re-run: values that
# were already converted to floats are parsed back unchanged instead of raising
# on a missing `.split` method.
metadata["experimental_time"] = (
    metadata["experimental_time"]
    .astype(str)
    .str.split("h")
    .str[0]
    .astype(float)
)

In [None]:
# Sorted distinct time points after the conversion to floats.
np.unique(metadata["experimental_time"])

In [None]:
# Attach the per-cell metadata, aligned by barcode rather than by row position.
# `adata` was subset in the order of `barcodes` (via `cells_to_keep`), while
# `metadata` keeps the order of the spreadsheet; assigning it directly would pair
# rows purely by position and silently mislabel cells if the two orders differ.
kept_barcodes = cells_to_keep[0].to_numpy()
obs_aligned = metadata.set_index("cell_barcode", drop=False).loc[kept_barcodes]
# Drop the index name so it does not duplicate the retained `cell_barcode` column.
obs_aligned.index.name = None
assert len(obs_aligned) == adata.n_obs, (
    "metadata rows do not match the retained cells one-to-one"
)
adata.obs = obs_aligned

Set the gene annotations (`adata.var`)

In [None]:
# Build the gene annotation table and index it by the "gene ID" column.
gene_annotations = genes.copy()  # leave the original `genes` frame untouched
# Use a plain, unnamed index: an index named like an existing column can clash
# with that column when the object is written to disk.
gene_annotations.index = pd.Index(gene_annotations["gene ID"].astype(str).to_numpy())
adata.var = gene_annotations
# Identifiers in the "gene ID" column are not guaranteed to be unique; make
# `var_names` unique so name-based indexing stays unambiguous.
adata.var_names_make_unique()

In [None]:
# Flag the genes whose name starts with the "MT-" prefix.
is_mt_gene = adata.var_names.str.startswith("MT-")
adata.var["mt"] = is_mt_gene

**Examine metadata**

In [None]:
# Count the distinct guide targets listed in the metadata.
n_guide_targets = len(np.unique(metadata["guide_target"]))
print(f"Number of targets {n_guide_targets}")

In [None]:
# Sorted list of the distinct guide targets.
np.unique(metadata["guide_target"])

Note: according to the paper's Methods section, GFP is the control target.

In [None]:
# Sorted list of the distinct guide phenotypes.
np.unique(metadata["guide_phenotype"])

In [None]:
# Total counts per gene across all retained cells. Summing the sparse matrix
# directly avoids the full dense copy made by `todense()`; `np.asarray(...).ravel()`
# flattens the 1 x n_genes result into a plain 1-D array.
counts_per_gene = np.asarray(adata.X.sum(axis=0)).ravel()
# The comparison already yields a boolean array, so no cast is needed
# (the `np.bool` alias used previously was removed in NumPy 1.24).
highly_expressed_genes = counts_per_gene > 10000
print(f"Number of genes with more than 10k counts: {np.sum(highly_expressed_genes)}")

Use the highly expressed genes (more than 10k total counts) as the `highly_variable` mask in `adata.var`

In [None]:
# Store the boolean mask under the "highly_variable" key, which the PCA cell
# below selects through `use_highly_variable=True`.
adata.var["highly_variable"] = highly_expressed_genes

Normalize as explained in the paper

In [None]:
# Scale every cell to the mean total count per cell over the dataset.
mean_counts_per_cell = np.mean(adata.X.sum(1))
sc.pp.normalize_total(adata, target_sum=mean_counts_per_cell)

PCA and UMAP

In [None]:
# Principal component analysis restricted to the genes flagged in
# adata.var["highly_variable"].
sc.tl.pca(
    adata,
    use_highly_variable=True,
    svd_solver="arpack",
)

In [None]:
# Neighbourhood graph computed on the first 40 principal components.
n_pcs_for_graph = 40
sc.pp.neighbors(adata, n_pcs=n_pcs_for_graph)

In [None]:
# Compute the UMAP embedding with scanpy's default settings.
sc.tl.umap(adata)

In [None]:
# One UMAP panel per annotation listed below.
umap_color_keys = ["cluster", "experimental_time", "interferon_score", "viral_load"]
sc.pl.umap(adata, color=umap_color_keys)

**Save the anndata**

In [None]:
# Output directory for processed datasets. Create it (and any missing parents)
# up front so the write in the next cell does not fail on a fresh checkout.
processed_data_dir = DATA_DIR / "processed"
processed_data_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Persist the processed object in .h5ad format.
output_path = processed_data_dir / "perturb_seq_host_factors.h5ad"
adata.write_h5ad(output_path)