# Mapping and classifying cells using the Human Lung Cell Atlas

## Environment Setup

In [None]:
import os

import warnings
# Silence FutureWarning and UserWarning for the rest of the session.
warnings.simplefilter("ignore", FutureWarning)
warnings.simplefilter("ignore", UserWarning)

In [2]:
import scanpy as sc
import numpy as np
import pandas as pd
import scarches as sca
import gdown
import gzip
import shutil

ModuleNotFoundError: No module named 'scarches'

In [None]:
# Report installed package versions through scanpy's logging helper.
sc.logging.print_versions()

## Download Files - Reference model, Embeddings etc.

In [None]:
# Make sure the sibling "data" directory exists. A single makedirs call with
# exist_ok=True replaces the check-then-create pair, which could fail if the
# directory appeared between the check and the mkdir.
os.makedirs("../data", exist_ok=True)

In [None]:
# Work from inside the data directory so the relative file names used by the
# download and read calls that follow resolve there.
os.chdir('../data')

In [None]:
# Download the HLCA embedding + metadata file into the current directory.
url = "https://zenodo.org/record/6337966/files/HLCA_emb_and_metadata.h5ad"
output = "HLCA_emb_and_metadata.h5ad"
gdown.download(url=url, output=output, quiet=False)

In [None]:
# Download the zipped reference model, extract it into the current directory,
# then delete the archive so only the extracted files remain.
url = "https://zenodo.org/record/6337966/files/HLCA_reference_model.zip"
output = "HLCA_reference_model.zip"
gdown.download(url=url, output=output, quiet=False)
shutil.unpack_archive(output)
os.remove(output)

## Download query data

In [None]:
# GEO download links for the query sample: the raw feature-barcode matrix
# (GSM5230027) and the lung metadata table (GSE171668).
query_data_url = (
    "https://www.ncbi.nlm.nih.gov/geo/download/"
    "?acc=GSM5230027&format=file"
    "&file=GSM5230027%5F04%2DP103142%2DS149%2DR01%5Fraw%5Ffeature%5Fbc%5Fmatrix%2Eh5%2Egz"
)
query_metadata_url = (
    "https://www.ncbi.nlm.nih.gov/geo/download/"
    "?acc=GSE171668&format=file"
    "&file=GSE171668%5Flung%5Fmetadata%2Ecsv%2Egz"
)

In [None]:
# Download the gzipped query matrix, decompress it to query.h5, and remove the
# compressed copy afterwards.
output = "query.h5.gz"
gdown.download(url=query_data_url, output=output, quiet=False)
with gzip.open(output, "rb") as f_in, open("query.h5", "wb") as f_out:
    shutil.copyfileobj(f_in, f_out)
os.remove(output)

In [None]:
# Download the gzipped metadata table; it is kept compressed on disk.
output = "query_obs.csv.gz"
gdown.download(url=query_metadata_url, output=output, quiet=False)

## Preprocessing

In [None]:
# Load the 10x HDF5 matrix for the query sample.
query_data_full = sc.read_10x_h5("query.h5")
# Clean up .var.index (gene names): keep the original index in a column, then
# re-index genes by the last "___"-separated field of `gene_ids`.
query_data_full.var['gene_names'] = query_data_full.var.index
query_data_full.var.index = [idx.split("___")[-1] for idx in query_data_full.var.gene_ids]
# Clean up cell barcodes: remove only a literal trailing "-1".
# (str.rstrip("-1") strips any run of trailing "-" / "1" characters rather than
# the suffix, so an anchored pattern is used instead.)
query_data_full.obs.index = query_data_full.obs.index.str.replace(r"-1$", "", regex=True)
# Read in metadata (to select only cells of interest and remove empty drops).
query_data_meta = pd.read_csv("query_obs.csv.gz", index_col=0)
# Subset to cells from our sample.
query_data_meta = query_data_meta.loc[query_data_meta.donor == "D12_4", :].copy()
# Clean up metadata barcodes: keep the last "-"-separated field of each index
# entry so it can be matched against the cleaned barcodes above.
query_data_meta.index = [idx.split("-")[-1] for idx in query_data_meta.index]
# Subset adata to cells in metadata.
query_data_full = query_data_full[query_data_meta.index, :].copy()
# Add dataset information.
query_data_full.obs['dataset'] = 'test_dataset_delorey_regev'

In [None]:
# gene order for scArches model
# gene order for scArches model, read from the unpacked reference model folder
# NOTE(review): read_csv treats the first row as a header by default — confirm
# that var_names.csv actually has a header row.
reference_gene_order = pd.read_csv('HLCA_reference_model/var_names.csv')
# reference embedding, including cell/sample/subject metadata:
reference_embedding = sc.read_h5ad('HLCA_emb_and_metadata.h5ad')

In [None]:
# Resolve the reference model folder to an absolute path while the working
# directory is still the data directory. `ref_model_path` is reused by later
# cells, and a relative path would stop resolving once the working directory
# changes.
ref_model_path = os.path.abspath('HLCA_reference_model')
# Prepare the query AnnData against the saved reference model; with
# inplace=False the prepared object is returned and stored in `query_data`.
query_data = sca.models.SCANVI.prepare_query_anndata(query_data_full, ref_model_path, inplace=False)

In [None]:
# Mark every query cell as 'unlabeled' in the `scanvi_label` column
# (presumably the label key the reference model expects — TODO confirm).
query_data.obs['scanvi_label'] = 'unlabeled'

## Perform surgery

In [None]:
# Name of the .obs column that holds the dataset label, and the single dataset
# value assigned to the query cells during preprocessing.
# NOTE(review): neither variable is used in the cells visible here — confirm
# whether they are still needed.
batch_key = 'dataset'
query_batch = ['test_dataset_delorey_regev']

In [None]:
# Upper bound on the number of surgery training epochs.
surgery_epochs = 500
# Keyword arguments unpacked into `surgery_model.train(...)`.
# The previous keys ("early_stopping_metric", "on", "threshold", ...) are
# legacy trainer options that are not parameters of the scvi-tools `train`
# call; they are expressed here with the scvi-tools `Trainer` early-stopping
# arguments, and the learning-rate schedule options go into `plan_kwargs`.
# NOTE(review): the legacy "save_best_state_metric" has no direct counterpart
# here and was dropped — confirm this is acceptable.
early_stopping_kwargs_surgery = {
    "early_stopping": True,
    # ELBO on the training split stands in for the old metric="elbo" with
    # on="full_dataset".
    "early_stopping_monitor": "elbo_train",
    "early_stopping_patience": 10,
    "early_stopping_min_delta": 0.001,
    "plan_kwargs": {
        "reduce_lr_on_plateau": True,
        "lr_patience": 8,
        "lr_factor": 0.1,
    },
}

### Surgery

In [None]:
# Switch the working directory to /notebooks; relative paths used from here on
# (e.g. the saved surgery model folder) resolve against it.
# NOTE(review): hard-coded absolute path; any relative path defined before
# this point is now resolved against /notebooks — confirm that is intended.
os.chdir('/notebooks')

In [None]:
# Create the surgery model from the prepared query data and the saved
# reference model located at `ref_model_path`.
surgery_model = sca.models.SCANVI.load_query_data(
    query_data,
    ref_model_path,
    freeze_dropout=True,
)

In [None]:
# surgery_model._unlabeled_indices = np.arange(query_data.n_obs)
# surgery_model._labeled_indices = []

In [None]:
# Train the surgery model for at most `surgery_epochs` epochs; every entry of
# `early_stopping_kwargs_surgery` is forwarded as a keyword argument.
# TODO: check that train() accepts every key in early_stopping_kwargs_surgery.
surgery_model.train(
    max_epochs=surgery_epochs,
    **early_stopping_kwargs_surgery
)

In [None]:
# Save the trained surgery model under the relative folder 'surgery_model'
# (resolved against the current working directory); overwrite=True is passed
# so an earlier save at the same location does not block the write.
surgery_path = 'surgery_model'
surgery_model.save(surgery_path, overwrite=True)

## ...

In [None]:
# NOTE(review): called with no arguments — presumably a placeholder for the
# label-transfer step; TODO supply the required inputs before running this cell.
sca.utils.knn.weighted_knn_transfer()