# Kang et. al dataset preparation (From raw data to h5ad)

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc

In [2]:
sc.set_figure_params(dpi=200)

In [3]:
count_data = pd.read_csv("./data/kang/kang_count.csv")
count_data = count_data.transpose()
count_data.head()

Unnamed: 0,AL627309.1,RP11-206L10.2,LINC00115,NOC2L,KLHL17,PLEKHN1,HES4,ISG15,AGRN,C1orf159,...,AJ006998.2,CHODL,AP000476.1,LINC00158,AF131217.1,AP001437.1,TMPRSS3,AP001626.1,AP001062.7,LRRC3DN
AAACATACATTTCC.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAACATACCAGAAA.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAACATACCTCGCT.1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
AAACATACCTGGTA.1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
AAACATACGATGAA.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
meta_data = pd.read_csv("./data/kang/kang_meta.csv")
meta_data.head()

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,stim,seurat_annotations,integrated_snn_res.0.5,seurat_clusters
AAACATACATTTCC.1,IMMUNE_CTRL,3017,877,CTRL,CD14 Mono,0,0
AAACATACCAGAAA.1,IMMUNE_CTRL,2481,713,CTRL,CD14 Mono,0,0
AAACATACCTCGCT.1,IMMUNE_CTRL,3420,850,CTRL,CD14 Mono,0,0
AAACATACCTGGTA.1,IMMUNE_CTRL,3156,1109,CTRL,pDC,11,11
AAACATACGATGAA.1,IMMUNE_CTRL,1868,634,CTRL,CD4 Memory T,2,2


In [5]:
adata = sc.AnnData(X=count_data)
adata.obs = meta_data.copy(deep=True)
adata

AnnData object with n_obs × n_vars = 13999 × 14053 
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'stim', 'seurat_annotations', 'integrated_snn_res.0.5', 'seurat_clusters'

In [6]:
adata = adata[~adata.obs['seurat_annotations'].isin(['Mk', 'pDC', 'Eryth'])]
adata

View of AnnData object with n_obs × n_vars = 13576 × 14053 
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'stim', 'seurat_annotations', 'integrated_snn_res.0.5', 'seurat_clusters'

In [7]:
adata.obs['condition'] = adata.obs['stim'].values
adata.obs['cell_type'] = adata.obs['seurat_annotations'].values

Trying to set attribute `.obs` of view, making a copy.


In [8]:
adata.obs['cell_type'] = adata.obs['cell_type'].replace("CD4 Naive T", "CD4 T")
adata.obs['cell_type'] = adata.obs['cell_type'].replace("CD4 Memory T", "CD4 T")
adata.obs['cell_type'] = adata.obs['cell_type'].replace("T activated", "T")
adata.obs['cell_type'] = adata.obs['cell_type'].replace("B Activated", "B")

In [9]:
adata.obs['cell_type'].value_counts()

CD14 Mono    4362
CD4 T        4266
B            1366
CD16 Mono    1044
CD8 T         814
T             633
NK            619
DC            472
Name: cell_type, dtype: int64

In [10]:
adata.X.min(), adata.X.max()

(0.0, 3828.0)

In [45]:
adata.write_h5ad("./data/kang/kang_count.h5ad")

... storing 'orig.ident' as categorical
... storing 'stim' as categorical
... storing 'seurat_annotations' as categorical
... storing 'condition' as categorical
... storing 'cell_type' as categorical


In [46]:
sc.pp.normalize_per_cell(adata)
sc.pp.log1p(adata)

In [49]:
adata.X.min(), adata.X.max()

(0.0, 6.941648)

In [48]:
adata.write_h5ad("./data/kang/kang_normalized.h5ad")