# Load data

## Set data file

In [None]:
# Input locations for the purified populations (GDT, CD4 T, CD8 T, NK).
# GDT is a single TSV table; the others are paths without a file extension.
gdt_3donors_f = "data/gdt_3donors.tsv"
cd4t_f = 'data/cd4t'
cd8t_d1_f = 'data/cd8t_donor1'
cd8t_d2_f = 'data/cd8t_donor2'
nk_f = 'data/nk'
# Input locations for the two PBMC batches (h5ad files)
pbmc_3p_f = 'data/pbmc_2_batch/pbmc_batch1.h5ad'
pbmc_5p_f = 'data/pbmc_2_batch/pbmc_batch2.h5ad'

  return dispatch(args[0].__class__)(*args, **kw)
  return dispatch(args[0].__class__)(*args, **kw)


## Load data into AnnData

### Load PBMC 3p and 5p

In [None]:
import scanpy as sc


# Read both PBMC batches from their h5ad files (batch 1 first, then batch 2)
pbmc_3p, pbmc_5p = map(sc.read_h5ad, (pbmc_3p_f, pbmc_5p_f))

In [None]:
# Show the distinct 'celltype' annotations found in each PBMC batch
for _title, _pbmc in (
    ("PBMC 3' cell types:", pbmc_3p),
    ("PBMC 5' cell types:", pbmc_5p),
):
    print(_title, _pbmc.obs['celltype'].unique())

PBMC 3' cell types: ['CD14+ monocyte', 'Dendritic cell', 'Cytotoxic T cell', 'CD16+ monocyte', 'Plasmacytoid dendritic cell', 'B cell', 'Natural killer cell', 'CD4+ T cell', 'Megakaryocyte']
Categories (9, object): ['B cell', 'CD4+ T cell', 'CD14+ monocyte', 'CD16+ monocyte', ..., 'Dendritic cell', 'Megakaryocyte', 'Natural killer cell', 'Plasmacytoid dendritic cell']
PBMC 5' cell types: ['B cell', 'Plasmacytoid dendritic cell', 'CD4+ T cell', 'Natural killer cell', 'Cytotoxic T cell', 'Megakaryocyte', 'CD14+ monocyte', 'CD16+ monocyte', 'Dendritic cell']
Categories (9, object): ['B cell', 'CD4+ T cell', 'CD14+ monocyte', 'CD16+ monocyte', ..., 'Dendritic cell', 'Megakaryocyte', 'Natural killer cell', 'Plasmacytoid dendritic cell']


### Load purified datasets

In [None]:
import pandas as pd


# TSV table -> AnnData
def convert_tsv_to_anndata(file_path, cell_type):
    """Build an AnnData object from a tab-separated table.

    The first column of the file becomes the table's row index. The table is
    transposed before being wrapped, so rows of the file end up as variables
    and columns end up as observations. Every observation receives
    ``cell_type`` in ``obs['celltype']``.
    """
    table = pd.read_csv(file_path, sep='\t', index_col=0)
    # AnnData expects observations along the first axis, hence the transpose
    observations_first = table.T
    result = sc.AnnData(observations_first)
    result.obs['celltype'] = cell_type
    return result

# 10x matrix folder -> AnnData
def convert_10x_to_anndata(folder_path, cell_type):
    """Read a 10x-format matrix folder and label all of its cells.

    Variables are named by gene symbol and duplicate names are made unique.
    Locating the files inside ``folder_path`` (including any handling of
    compressed versus uncompressed files) is left entirely to
    ``sc.read_10x_mtx``. Every observation receives ``cell_type`` in
    ``obs["celltype"]``.
    """
    reader_options = {"var_names": "gene_symbols", "make_unique": True}
    labelled = sc.read_10x_mtx(folder_path, **reader_options)
    labelled.obs["celltype"] = cell_type
    return labelled

In [None]:
# Wrap each purified population in AnnData, labelling every cell with its type
gdt_3donors = convert_tsv_to_anndata(file_path=gdt_3donors_f, cell_type='GDT')
cd4t = convert_10x_to_anndata(folder_path=cd4t_f, cell_type='CD4 T cell')
# Both CD8 T donors share the same label
cd8t_d1, cd8t_d2 = (
    convert_10x_to_anndata(folder_path=donor_dir, cell_type='CD8 T cell')
    for donor_dir in (cd8t_d1_f, cd8t_d2_f)
)
nk = convert_10x_to_anndata(folder_path=nk_f, cell_type='Natural killer cell')

# Preprocessing

## Filter GDT-similar cells from PBMC data
Remove the cell types that are highly similar to GDT (CD4 T, CD8 T and NK cells) from the PBMC data. They are later replaced with the purified CD4 T, CD8 T, NK and GDT datasets, which prevents false negatives caused by inaccurate annotation in the PBMC data.

In [None]:
# Show the annotations present in each PBMC batch before filtering
_celltype_listing = (
    ("PBMC 3' cell types:", pbmc_3p),
    ("PBMC 5' cell types:", pbmc_5p),
)
for _heading, _dataset in _celltype_listing:
    print(_heading, _dataset.obs['celltype'].unique())

In [None]:
# Annotations to drop from both PBMC batches
remove_celltypes = ['Cytotoxic T cell', 'CD4+ T cell', 'Natural killer cell']
pbmc_3p_clean, pbmc_5p_clean = (
    batch[~batch.obs['celltype'].isin(remove_celltypes)]
    for batch in (pbmc_3p, pbmc_5p)
)

# Print the remaining annotations so the removal can be verified by eye
print("Cleaned PBMC 3' cell types:", pbmc_3p_clean.obs['celltype'].unique())
print("Cleaned PBMC 5' cell types:", pbmc_5p_clean.obs['celltype'].unique())

## Quality control

In [None]:
# Quality control function
def quality_control(
        adata,
        min_genes=200,
        max_genes=6000,
        min_cells=3,
        max_mt=10,
):
    """Return a QC-filtered copy of ``adata``; the input object is not modified.

    Steps, in order:
      1. ``sc.pp.filter_cells`` with ``min_genes``;
      2. ``sc.pp.filter_genes`` with ``min_cells``;
      3. flag genes whose name starts with "MT-" (case-insensitive) in
         ``var["mt"]`` and compute QC metrics with ``qc_vars=["mt"]``;
      4. keep only cells with ``n_genes_by_counts < max_genes`` and
         ``pct_counts_mt < max_mt``.

    A one-line summary (cell-type label(s), cell count before and after) is
    printed at the end.

    Parameters
    ----------
    adata
        AnnData object to filter.
    min_genes
        Passed to ``sc.pp.filter_cells`` as ``min_genes``.
    max_genes
        Exclusive upper bound on ``obs["n_genes_by_counts"]``.
    min_cells
        Passed to ``sc.pp.filter_genes`` as ``min_cells``.
    max_mt
        Exclusive upper bound on ``obs["pct_counts_mt"]``.

    Returns
    -------
    A new AnnData object containing only the cells and genes that passed.
    """
    # Work on a copy so that views / caller-owned objects are never mutated
    adata_qc = adata.copy()

    # Filter low-gene cells and rare genes
    sc.pp.filter_cells(adata_qc, min_genes=min_genes)
    sc.pp.filter_genes(adata_qc, min_cells=min_cells)

    # Mark mitochondrial genes (upper-casing makes the prefix match
    # case-insensitive)
    adata_qc.var["mt"] = adata_qc.var_names.str.upper().str.startswith("MT-")

    # Compute QC metrics; this provides the obs columns used below
    sc.pp.calculate_qc_metrics(
        adata_qc,
        qc_vars=["mt"],
        percent_top=None,
        log1p=False,
        inplace=True,
    )

    # Apply both per-cell upper bounds (too many genes, too much mito) with a
    # single mask, so the data is subset and copied only once
    keep = (
        (adata_qc.obs["n_genes_by_counts"] < max_genes)
        & (adata_qc.obs["pct_counts_mt"] < max_mt)
    )
    adata_qc = adata_qc[keep, :].copy()

    # Summarise the dataset by its distinct cell-type labels. Interpolating
    # the obs column itself would dump the whole per-cell Series into the
    # message, and would raise KeyError when the column is absent.
    if "celltype" in adata.obs:
        label = ", ".join(adata.obs["celltype"].astype(str).unique())
    else:
        label = ""
    if not label:
        label = "unlabelled"

    # Print final cell count
    print(f"{label} cells before QC: {adata.n_obs}, after QC: {adata_qc.n_obs}")

    return adata_qc

In [None]:
# Run QC with the default thresholds on every dataset, in the same order
pbmc_3p_clean_qc, pbmc_5p_clean_qc = map(
    quality_control, (pbmc_3p_clean, pbmc_5p_clean)
)
gdt_3donors_qc = quality_control(gdt_3donors)
cd4t_qc, cd8t_d1_qc, cd8t_d2_qc, nk_qc = map(
    quality_control, (cd4t, cd8t_d1, cd8t_d2, nk)
)

In [None]:
# Keep only the annotation column in obs and tag each dataset with its batch
keep_obs = "celltype"
pbmc_3p_clean_qc.obs = pbmc_3p_clean_qc.obs[[keep_obs]].assign(batch='3p')
pbmc_5p_clean_qc.obs = pbmc_5p_clean_qc.obs[[keep_obs]].assign(batch='5p')

# Write the cleaned, QC-filtered PBMC batches to disk
pbmc_3p_clean_qc.write_h5ad("data/pbmc_3p_clean_qc.h5ad")
pbmc_5p_clean_qc.write_h5ad("data/pbmc_5p_clean_qc.h5ad")