In [1]:
import scanpy as sc
import scvelo as scv
import numpy as np
import pandas as pd
import anndata
import seaborn as sns
import matplotlib.pyplot as plt

from PerturbSeq_CMV.paths import DATA_DIR

In [2]:
# Directory containing the processed CRISPRn Perturb-seq files read below.
folder = (
    DATA_DIR
    / "rawdata"
    / "CRISPRn_perturb_virus_host_processed"
)

In [None]:
# Load the count matrix; per the variable name it is stored genes x cells
# and is transposed in the next cell.
matrix_path = folder / "matrix.mtx"
adata_genesxcells = sc.read_mtx(matrix_path)

Transpose the matrix from genes × cells to cells × genes

In [None]:
# Re-orient the counts so that rows are cells and columns are genes.
# The transposed sparse matrix is stored as CSR so that selecting cells (rows)
# further down stays efficient.
adata = anndata.AnnData(adata_genesxcells.X.T.tocsr())

# Display the AnnData summary instead of `adata.X.todense()`: densifying the
# full count matrix only to print it allocates a cells x genes dense array.
adata

**Read metadata**

In [None]:
# Paths of the three annotation files that sit next to the count matrix.
barcode_path, cellidentity_path, genes_path = (
    folder / filename
    for filename in ("barcodes.tsv", "cell_identities.csv", "genes.tsv")
)

In [None]:
# The .tsv files are tab-separated and headerless, so pandas numbers their
# columns from 0; the .csv file is read with its own header row.
tsv_options = dict(sep="\t", header=None)

barcodes = pd.read_csv(barcode_path, **tsv_options)
genes = pd.read_csv(genes_path, **tsv_options)
cellidentity = pd.read_csv(cellidentity_path)

# Name the two gene columns (identifier, symbol).
genes.columns = ["ENSMBL", "gene ID"]

In [None]:
# Per-cell metadata spreadsheet; preview the first rows.
metadata_path = folder / "CRISPRn_perturb_virus_host_processed.xlsx"
metadata = pd.read_excel(metadata_path)
metadata.head()

The metadata covers both viral and human reads.

Keep only the cells whose barcode appears in the metadata table

In [None]:
# Boolean mask over barcodes.tsv rows: True where the barcode (column 0)
# is listed in the metadata table.
in_metadata = barcodes[0].isin(metadata["cell_barcode"].to_numpy())
cells_to_keep = barcodes.loc[in_metadata]

# Row labels of the retained barcodes, used to subset `adata` in the next cell.
cells_to_keep_idx = cells_to_keep.index.to_numpy()

In [None]:
# Restrict the AnnData object to the retained cells (all genes are kept).
# NOTE: this rebinds `adata`, so the cell is meant to be run once per kernel session.
adata = adata[cells_to_keep_idx, :]

In [None]:
# Attach the per-cell metadata as `adata.obs`.
#
# The rows of `adata` follow the order of barcodes.tsv, whereas the spreadsheet
# rows come in their own order, and assigning a DataFrame to `adata.obs` matches
# rows by position. Re-order the metadata on `cell_barcode` first so that every
# cell receives its own annotation.
kept_barcodes = cells_to_keep[0].to_numpy()

if metadata["cell_barcode"].duplicated().any():
    raise ValueError(
        "metadata contains duplicated cell_barcode entries; "
        "cannot align it unambiguously to the count matrix"
    )

# `metadata` is rebound to the aligned frame so that later cells which edit
# `metadata` keep operating on the same table that is attached to `adata`.
metadata = (
    metadata
    .set_index("cell_barcode", drop=False)
    .loc[kept_barcodes]
    .reset_index(drop=True)
)
adata.obs = metadata

Convert the experimental time from strings with an `h` suffix to floats

In [None]:
def _strip_hour_suffix(times):
    """Return ``times`` as floats, keeping only the text before the first 'h'.

    Values are cast to ``str`` first, so the conversion also succeeds on a
    column that is already numeric; the cell can therefore be re-run safely.
    """
    return times.astype(str).str.split("h").str[0].astype(float)


metadata["experimental_time"] = _strip_hour_suffix(metadata["experimental_time"])

# `adata.obs` was assigned from `metadata` before this conversion. Whether the
# two still share data depends on the anndata version, so convert the obs
# column explicitly as well (a no-op if they are the same object).
adata.obs["experimental_time"] = _strip_hour_suffix(adata.obs["experimental_time"])

Set the gene table as var (the metadata df was set as obs above)

In [None]:
# Attach the gene table as the variable (column) annotation of `adata`.
adata.var = pd.DataFrame(genes)
# Index the genes by the "gene ID" column; the column itself is kept as well.
# NOTE(review): these IDs are not checked for uniqueness here — confirm.
adata.var.index = adata.var["gene ID"]
# Boolean flag for genes whose name starts with "MT-" (presumably mitochondrial genes).
adata.var["mt"] = adata.var_names.str.startswith("MT-") 

**Examine metadata**

In [None]:
# Count the distinct guide targets present in the metadata.
n_targets = np.unique(metadata.guide_target).size
print(f"Number of targets {n_targets}")

In [None]:
# Sorted list of distinct guide targets. A single np.unique call is enough:
# applying it a second time to an already unique, sorted array returns the
# same array.
np.unique(metadata.guide_target)

21 human factors and 31 viral factors, plus 1 control each

In [None]:
# Distinct guide phenotype labels present in the metadata.
guide_phenotypes = metadata["guide_phenotype"]
np.unique(guide_phenotypes)

Pre-process

In [None]:
# Genes whose total count over the retained cells exceeds this threshold are
# flagged and used as the feature set for PCA.
MIN_TOTAL_COUNTS = 10000

# Sum per gene directly on the sparse matrix (no dense copy is needed);
# np.asarray(...).ravel() flattens the 1 x n_genes result to a 1-D vector.
gene_totals = np.asarray(adata.X.sum(axis=0)).ravel()

# A plain comparison already yields a boolean array. The previous
# `.astype(np.bool)` fails on NumPy >= 1.24, where the `np.bool` alias was removed.
highly_expressed_genes = gene_totals > MIN_TOTAL_COUNTS
print(f"Number of genes with more than 10k counts: {np.sum(highly_expressed_genes)}")

# Stored under "highly_variable" because the PCA cell is called with
# use_highly_variable=True; note the flag marks highly *expressed* genes.
adata.var["highly_variable"] = highly_expressed_genes

# Scale every cell to the mean total count per cell. `target_sum` is passed by
# keyword because recent scanpy releases deprecate passing it positionally.
sc.pp.normalize_total(adata, target_sum=np.mean(adata.X.sum(1)))

PCA and UMAP

In [None]:
# Principal component analysis on the genes flagged in
# adata.var["highly_variable"], using the ARPACK solver.
sc.tl.pca(
    adata,
    use_highly_variable=True,
    svd_solver="arpack",
)

In [None]:
# Build the neighbourhood graph from the first 40 principal components.
sc.pp.neighbors(
    adata,
    n_pcs=40,
)

In [None]:
# UMAP embedding computed from the neighbourhood graph built above.
sc.tl.umap(
    adata,
    spread=1.5,
    min_dist=0.1,
)

In [None]:
# One UMAP panel per annotation column listed here.
umap_color_keys = ["cluster", "experimental_time", "interferon_score", "viral_load"]
sc.pl.umap(adata, color=umap_color_keys)

**Save the anndata**