# 01 - Preprocess

In [None]:

from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc

# Preprocessing pipeline: load -> filter -> QC -> HVG selection -> normalize ->
# scale -> PCA -> neighbors -> UMAP, then persist the result and a preview plot.

# Make sure the output locations exist before doing any expensive work, so the
# script does not fail at the very end on a fresh checkout.
results_dir = Path("results")
figures_dir = results_dir / "figures"
figures_dir.mkdir(parents=True, exist_ok=True)

adata = sc.read_h5ad("data/cleaned_processed_frogtail.h5ad")

# Basic filtering: drop cells with fewer than 200 detected genes and genes
# detected in fewer than 3 cells.
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)

# Flag mitochondrial genes by name prefix unless the input already carries an
# 'mt' annotation, then compute per-cell / per-gene QC metrics.
if 'mt' not in adata.var.columns:
    adata.var['mt'] = adata.var_names.str.upper().str.startswith(('MT-', 'MT.'))
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], inplace=True)

# flavor='seurat_v3' expects raw counts, so highly variable genes must be
# selected BEFORE normalize_total/log1p. The resulting .var annotations are
# untouched by the in-place normalization below.
# NOTE(review): this assumes adata.X holds raw counts at load time - confirm
# against how "cleaned_processed_frogtail.h5ad" was produced.
sc.pp.highly_variable_genes(adata, n_top_genes=3000, flavor='seurat_v3')

# Library-size normalization followed by log1p transform.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Keep only the highly variable genes for the embedding steps.
adata = adata[:, adata.var.highly_variable].copy()

# Scale (clipped at 10), then PCA -> kNN graph -> UMAP.
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver='arpack')
sc.pp.neighbors(adata, n_neighbors=15, n_pcs=30)
sc.tl.umap(adata)

adata.write_h5ad(str(results_dir / "preprocessed.h5ad"))

# Colour the preview UMAP by the first available metadata column, if any.
keys = [
    k
    for k in ['tissue', 'batch', 'condition', 'time', 'stage', 'cell_type']
    if k in adata.obs.columns
]
sc.pl.umap(adata, color=keys[:1] if keys else None, show=False)
plt.savefig(str(figures_dir / "umap_prelim.png"), bbox_inches="tight")
print("ok")
