# Load data
Load raw 10X Genomics data (https://www.10xgenomics.com/datasets) of:
* Two-batch (3' and 5') PBMC dataset
* Purified datasets of GDT and of cell types close to GDT:
    * [CD4 T cells](https://www.10xgenomics.com/datasets/cd-4-plus-helper-t-cells-1-standard-1-1-0)
    * [CD8 T cells](https://www.10xgenomics.com/datasets/cd-8-plus-t-cells-of-healthy-donor-1-1-standard-3-0-2)
    * [NK cells](https://www.10xgenomics.com/datasets/cd-56-plus-natural-killer-cells-1-standard-1-1-0)
    * GDT from 3 donors (PNAS paper)

## Load data into AnnData

In [1]:
# Root directory that holds every raw input used below.
raw_data_path = "data/raw"

# Purified populations: a TSV file for GDT, plus one input path each for
# CD4 T, CD8 T (two donors) and NK.
gdt_3donors_f = raw_data_path + "/gdt_3donors.tsv"
cd4t_f = raw_data_path + "/cd4t"
cd8t_d1_f = raw_data_path + "/cd8t_donor1"
cd8t_d2_f = raw_data_path + "/cd8t_donor2"
nk_f = raw_data_path + "/nk"

# Two-batch PBMC dataset, one .h5ad file per batch
# (batch1 -> 3', batch2 -> 5', going by the variable names).
pbmc_3p_f = raw_data_path + "/pbmc_2_batch/pbmc_batch1.h5ad"
pbmc_5p_f = raw_data_path + "/pbmc_2_batch/pbmc_batch2.h5ad"

### Load PBMC 3p and 5p

In [2]:
import scanpy as sc


# Read both PBMC batches from their .h5ad files, 3' first and then 5'.
pbmc_3p_ad, pbmc_5p_ad = (
    sc.read_h5ad(h5ad_path) for h5ad_path in (pbmc_3p_f, pbmc_5p_f)
)

  return dispatch(args[0].__class__)(*args, **kw)
  return dispatch(args[0].__class__)(*args, **kw)


### Load purified (CD4 T, CD8 T, NK, GDT) datasets

In [None]:
import data_functions as dfuncs
from importlib import reload
# Re-execute the helper module so that edits made to data_functions since
# it was first imported are picked up in this session.
reload(dfuncs)


# Build one AnnData per purified population. GDT is read from a TSV file;
# the remaining populations go through the 10x converter. The second
# argument is the cell-type label (presumably stored on the resulting
# object -- confirm in data_functions).
gdt_3donors_ad = dfuncs.convert_tsv_to_anndata(gdt_3donors_f, 'GDT')
cd4t_ad, cd8t_d1_ad, cd8t_d2_ad, nk_ad = (
    dfuncs.convert_10x_to_anndata(src_path, label)
    for src_path, label in (
        (cd4t_f, 'CD4+ T cell'),
        (cd8t_d1_f, 'CD8+ T cell'),
        (cd8t_d2_f, 'CD8+ T cell'),
        (nk_f, 'Natural killer cell'),
    )
)

# Preprocessing

## Filter GDT-similar cells from PBMC data
Remove cell types with high similarity to GDT (CD4 T, CD8 T, NK cells) from the PBMC data. They are later replaced with the purified datasets, together with purified GDT, to prevent false negatives caused by inaccurate annotation.

In [4]:
# List the distinct cell-type labels present in each PBMC batch.
for _heading, _adata in (
    ("PBMC 3' cell types:", pbmc_3p_ad),
    ("PBMC 5' cell types:", pbmc_5p_ad),
):
    print(_heading, _adata.obs['celltype'].unique())

PBMC 3' cell types: ['CD14+ monocyte', 'Dendritic cell', 'Cytotoxic T cell', 'CD16+ monocyte', 'Plasmacytoid dendritic cell', 'B cell', 'Natural killer cell', 'CD4+ T cell', 'Megakaryocyte']
Categories (9, object): ['B cell', 'CD4+ T cell', 'CD14+ monocyte', 'CD16+ monocyte', ..., 'Dendritic cell', 'Megakaryocyte', 'Natural killer cell', 'Plasmacytoid dendritic cell']
PBMC 5' cell types: ['B cell', 'Plasmacytoid dendritic cell', 'CD4+ T cell', 'Natural killer cell', 'Cytotoxic T cell', 'Megakaryocyte', 'CD14+ monocyte', 'CD16+ monocyte', 'Dendritic cell']
Categories (9, object): ['B cell', 'CD4+ T cell', 'CD14+ monocyte', 'CD16+ monocyte', ..., 'Dendritic cell', 'Megakaryocyte', 'Natural killer cell', 'Plasmacytoid dendritic cell']


In [5]:
# Cell types to drop from both PBMC batches:
# 'Cytotoxic T cell', 'CD4+ T cell', 'Natural killer cell'
remove_celltypes = ['Cytotoxic T cell', 'CD4+ T cell', 'Natural killer cell']

# Row masks selecting the cells whose label is NOT in the removal list.
keep_3p_mask = ~pbmc_3p_ad.obs['celltype'].isin(remove_celltypes)
keep_5p_mask = ~pbmc_5p_ad.obs['celltype'].isin(remove_celltypes)

# Indexing an AnnData returns a view of the parent object. Take an explicit
# copy so each filtered object owns its data and later in-place steps do not
# operate on (or implicitly copy) a view of the unfiltered PBMC object.
pbmc_3p_clean_ad = pbmc_3p_ad[keep_3p_mask].copy()
pbmc_5p_clean_ad = pbmc_5p_ad[keep_5p_mask].copy()

# Check if removal was successful: none of the removed labels should be listed
print("Cleaned PBMC 3' cell types:", pbmc_3p_clean_ad.obs['celltype'].unique())
print("Cleaned PBMC 5' cell types:", pbmc_5p_clean_ad.obs['celltype'].unique())

Cleaned PBMC 3' cell types: ['CD14+ monocyte', 'Dendritic cell', 'CD16+ monocyte', 'Plasmacytoid dendritic cell', 'B cell', 'Megakaryocyte']
Categories (6, object): ['B cell', 'CD14+ monocyte', 'CD16+ monocyte', 'Dendritic cell', 'Megakaryocyte', 'Plasmacytoid dendritic cell']
Cleaned PBMC 5' cell types: ['B cell', 'Plasmacytoid dendritic cell', 'Megakaryocyte', 'CD14+ monocyte', 'CD16+ monocyte', 'Dendritic cell']
Categories (6, object): ['B cell', 'CD14+ monocyte', 'CD16+ monocyte', 'Dendritic cell', 'Megakaryocyte', 'Plasmacytoid dendritic cell']


## Quality control

In [6]:
# Run the shared QC helper on every dataset, in the same order as before:
# the two filtered PBMC batches first, then the purified populations.
pbmc_3p_clean_qc_ad, pbmc_5p_clean_qc_ad = [
    dfuncs.quality_control(adata)
    for adata in (pbmc_3p_clean_ad, pbmc_5p_clean_ad)
]
gdt_3donors_qc_ad, cd4t_qc_ad, cd8t_d1_qc_ad, cd8t_d2_qc_ad, nk_qc_ad = [
    dfuncs.quality_control(adata)
    for adata in (gdt_3donors_ad, cd4t_ad, cd8t_d1_ad, cd8t_d2_ad, nk_ad)
]

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


['CD14+ monocyte', 'Dendritic cell', 'CD16+ monocyte', 'Plasmacytoid dendritic cell', 'B cell', 'Megakaryocyte']
Categories (6, object): ['B cell', 'CD14+ monocyte', 'CD16+ monocyte', 'Dendritic cell', 'Megakaryocyte', 'Plasmacytoid dendritic cell'] cells before QC: 1332, after QC: 1053
['B cell', 'Plasmacytoid dendritic cell', 'Megakaryocyte', 'CD14+ monocyte', 'CD16+ monocyte', 'Dendritic cell']
Categories (6, object): ['B cell', 'CD14+ monocyte', 'CD16+ monocyte', 'Dendritic cell', 'Megakaryocyte', 'Plasmacytoid dendritic cell'] cells before QC: 1097, after QC: 815
['GDT'] cells before QC: 8202, after QC: 8163
['GDT'] cells before QC: 8202, after QC: 8163
['CD4 T cell'] cells before QC: 11213, after QC: 11209
['CD4 T cell'] cells before QC: 11213, after QC: 11209
['CD8 T cell'] cells before QC: 55206, after QC: 38364
['CD8 T cell'] cells before QC: 55206, after QC: 38364
['CD8 T cell'] cells before QC: 91921, after QC: 72747
['CD8 T cell'] cells before QC: 91921, after QC: 72747
['N

In [7]:
# Keep only the cell-type annotation column on the PBMC batches.
# `obs[[keep_obs]]` is a column slice of the previous obs frame; take an
# explicit copy so the 'batch' column added below is written to a
# standalone DataFrame rather than to a slice of the old one.
keep_obs = "celltype"
pbmc_3p_clean_qc_ad.obs = pbmc_3p_clean_qc_ad.obs[[keep_obs]].copy()
pbmc_5p_clean_qc_ad.obs = pbmc_5p_clean_qc_ad.obs[[keep_obs]].copy()

# Add batch info (same value for every cell in a given PBMC batch)
pbmc_3p_clean_qc_ad.obs['batch'] = '3p'
pbmc_5p_clean_qc_ad.obs['batch'] = '5p'

# Save cleaned datasets: the QC'd PBMC batches and the QC'd purified sets.
dfuncs.save_h5ad(pbmc_3p_clean_qc_ad, "pbmc_3p_clean_qc.h5ad")
dfuncs.save_h5ad(pbmc_5p_clean_qc_ad, "pbmc_5p_clean_qc.h5ad")
dfuncs.save_h5ad(gdt_3donors_qc_ad, "gdt_3donors_qc.h5ad")
dfuncs.save_h5ad(cd4t_qc_ad, "cd4t_qc.h5ad")
dfuncs.save_h5ad(cd8t_d1_qc_ad, "cd8t_d1_qc.h5ad")
dfuncs.save_h5ad(cd8t_d2_qc_ad, "cd8t_d2_qc.h5ad")
dfuncs.save_h5ad(nk_qc_ad, "nk_qc.h5ad")

Saved data/processed/pbmc_3p_clean_qc.h5ad
Saved data/processed/pbmc_5p_clean_qc.h5ad
Saved data/processed/pbmc_5p_clean_qc.h5ad
Saved data/processed/gdt_3donors_qc.h5ad
Saved data/processed/gdt_3donors_qc.h5ad
Saved data/processed/cd4t_qc.h5ad
Saved data/processed/cd4t_qc.h5ad
Saved data/processed/cd8t_d1_qc.h5ad
Saved data/processed/cd8t_d1_qc.h5ad
Saved data/processed/cd8t_d2_qc.h5ad
Saved data/processed/cd8t_d2_qc.h5ad
Saved data/processed/nk_qc.h5ad
Saved data/processed/nk_qc.h5ad
