# Human PBMCs - Villani et al. dataset 

In [1]:
import scanpy as sc
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import colors
import seaborn as sb
import glob

import rpy2.rinterface_lib.callbacks
import logging

from rpy2.robjects import pandas2ri
import anndata2ri

In [2]:
# Ignore R warning messages
#Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# Automatically convert rpy2 outputs to pandas dataframes
pandas2ri.activate()
# Also activate the anndata2ri converter and load the rpy2 IPython extension,
# which the %%R cell below relies on.
anndata2ri.activate()
%load_ext rpy2.ipython

plt.rcParams['figure.figsize']=(8,8) #rescale figures
sc.settings.verbosity = 3
#sc.set_figure_params(dpi=200, dpi_save=300)
# Print the versions of the main dependencies (recorded in the cell output)
sc.logging.print_versions()

# Output path of the processed AnnData object written by the final cell
results_file = './write/Villani_hum_PBMCs_pp.h5ad'

scanpy==1.4.4+40.gbd5f862 anndata==0.6.22.post1 umap==0.3.9 numpy==1.15.4 scipy==1.3.0 pandas==0.24.2 scikit-learn==0.21.2 statsmodels==0.10.1


In [4]:
%%R
# Load all the R libraries we will be using in the notebook
# NOTE(review): no later cell visible in this notebook calls into scran - confirm it is still needed
library(scran)

## Load 
Here we load the pre-processed dataset (which has been annotated) and the raw expression matrix (which has not been filtered at the gene level).

### Raw data

In [3]:
# Read the raw expression matrix (tab-separated text, first column holds the row names)
file = '../Munich/datasets/human/Villani/GSE94820_raw_expMatrix_DCnMono_discovery_set_submission.txt.gz'
# The matrix is transposed right after reading; the resulting shape is (1140, 26593)
adata_raw = sc.read(
    filename=file,
    cache=True,
    delimiter='\t',
    first_column_names=True,
).transpose()
adata_raw.var_names_make_unique()
adata_raw.shape

... reading from cache file cache/..-Munich-datasets-human-Villani-GSE94820_raw_expMatrix_DCnMono_discovery_set_submission.h5ad


(1140, 26593)

In [4]:
# Preview the cell-level metadata (obs) of the raw matrix
adata_raw.obs.head()

CD141_P10_S73
CD141_P10_S74
CD141_P10_S75
CD141_P10_S76
CD141_P10_S77


In [5]:
# Preview the gene-level metadata (var) of the raw matrix
adata_raw.var.head()

1/2-SBSRNA4
5S_RRNA
5_8S_RRNA
7SK
A1BG


In [6]:
# Label the gene index so it is written out under an explicit name
adata_raw.var.index.name = 'gene_symbol'

In [7]:
# Annotate every cell with the same study-level metadata
adata_raw.obs.index.name = 'barcode'

# (column, value) pairs, applied in this order so the column layout is stable
study_metadata = (
    ('batch', 'Villani'),
    ('study', 'Villani'),
    ('chemistry', 'smart-seq2'),
    ('tissue', 'PBMCs'),
    ('species', 'Human'),
    ('data_type', 'TPM'),
)
for column, value in study_metadata:
    adata_raw.obs[column] = [value] * adata_raw.n_obs
adata_raw.obs.head()

Unnamed: 0_level_0,batch,study,chemistry,tissue,species,data_type
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CD141_P10_S73,Villani,Villani,smart-seq2,PBMCs,Human,TPM
CD141_P10_S74,Villani,Villani,smart-seq2,PBMCs,Human,TPM
CD141_P10_S75,Villani,Villani,smart-seq2,PBMCs,Human,TPM
CD141_P10_S76,Villani,Villani,smart-seq2,PBMCs,Human,TPM
CD141_P10_S77,Villani,Villani,smart-seq2,PBMCs,Human,TPM


In [8]:
# Append the study tag to every barcode and use the result as the obs index
adata_obs = (
    adata_raw.obs
    .reset_index()
    .assign(barcode=lambda obs: obs['barcode'] + '-Villani')
    .set_index('barcode')
)
adata_raw.obs = adata_obs
adata_raw.obs.head()

Unnamed: 0_level_0,batch,study,chemistry,tissue,species,data_type
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CD141_P10_S73-Villani,Villani,Villani,smart-seq2,PBMCs,Human,TPM
CD141_P10_S74-Villani,Villani,Villani,smart-seq2,PBMCs,Human,TPM
CD141_P10_S75-Villani,Villani,Villani,smart-seq2,PBMCs,Human,TPM
CD141_P10_S76-Villani,Villani,Villani,smart-seq2,PBMCs,Human,TPM
CD141_P10_S77-Villani,Villani,Villani,smart-seq2,PBMCs,Human,TPM


### Pre-processed data

In [9]:
# Load the previously annotated (pre-processed) object
file_paths = '../PBMC_human/write/Villani_PBMCs_1.h5ad'
adata_pp = sc.read(filename=file_paths, cache=True)
adata_pp.shape

(1022, 15329)

In [10]:
# Preview the cell-level metadata (obs) of the pre-processed object
adata_pp.obs.head()

Unnamed: 0_level_0,n_genes,louvain_r1,cell_type,final_annotation
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CD141_P10_S73,5811,3,CD141,Monocyte-derived dendritic cells
CD141_P10_S75,5899,3,CD141,Monocyte-derived dendritic cells
CD141_P10_S76,4178,3,CD141,Monocyte-derived dendritic cells
CD141_P10_S77,6128,3,CD141,Monocyte-derived dendritic cells
CD141_P10_S79,6342,3,CD141,Monocyte-derived dendritic cells


In [11]:
# Keep only the annotation and gene-count columns of the pre-processed obs,
# and give the cells the same '-Villani'-suffixed 'barcode' index as the raw object
adata_obs = (
    adata_pp.obs
    .reset_index()
    .loc[:, ['index', 'final_annotation', 'n_genes']]
    .rename(columns={'index': 'barcode'})
    .assign(barcode=lambda obs: obs['barcode'] + '-Villani')
    .set_index('barcode')
)
adata_pp.obs = adata_obs
adata_pp.obs.head()

Unnamed: 0_level_0,final_annotation,n_genes
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1
CD141_P10_S73-Villani,Monocyte-derived dendritic cells,5811
CD141_P10_S75-Villani,Monocyte-derived dendritic cells,5899
CD141_P10_S76-Villani,Monocyte-derived dendritic cells,4178
CD141_P10_S77-Villani,Monocyte-derived dendritic cells,6128
CD141_P10_S79-Villani,Monocyte-derived dendritic cells,6342


In [12]:
# Attach the annotations of the pre-processed object to the raw cells.
# The left join keeps every raw cell in its original order; cells that are absent
# from adata_pp get NaN in 'final_annotation' / 'n_genes' (which is why n_genes
# shows up as float) and are removed in the following cell.
adata_obs_raw = adata_raw.obs.reset_index()
adata_obs_pp = adata_pp.obs.reset_index()
# validate='many_to_one' makes pandas raise a MergeError if a barcode occurs more
# than once in the annotated table; such duplicates would add rows to the result
# and break the one-row-per-cell alignment between obs and the expression matrix.
adata_merged = adata_obs_raw.merge(
    adata_obs_pp,
    on='barcode',
    how='left',
    validate='many_to_one',
)
adata_merged.set_index('barcode', inplace = True)
adata_raw.obs = adata_merged
adata_raw.obs.head()

Unnamed: 0_level_0,batch,study,chemistry,tissue,species,data_type,final_annotation,n_genes
barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CD141_P10_S73-Villani,Villani,Villani,smart-seq2,PBMCs,Human,TPM,Monocyte-derived dendritic cells,5811.0
CD141_P10_S74-Villani,Villani,Villani,smart-seq2,PBMCs,Human,TPM,,
CD141_P10_S75-Villani,Villani,Villani,smart-seq2,PBMCs,Human,TPM,Monocyte-derived dendritic cells,5899.0
CD141_P10_S76-Villani,Villani,Villani,smart-seq2,PBMCs,Human,TPM,Monocyte-derived dendritic cells,4178.0
CD141_P10_S77-Villani,Villani,Villani,smart-seq2,PBMCs,Human,TPM,Monocyte-derived dendritic cells,6128.0


In [13]:
# Restrict to the cells that received an annotation (non-null 'final_annotation').
# Boolean indexing an AnnData returns a view; .copy() materialises it so that the
# in-place gene filtering and layer assignment that follow act on an independent
# object instead of triggering an implicit copy of the view.
adata_raw = adata_raw[~pd.isnull(adata_raw.obs['final_annotation'])].copy()
adata_raw.shape

(1022, 26593)

### Normalization

In [14]:
# Drop genes that are not detected in any of the remaining cells
print('Total number of genes: {:d}'.format(adata_raw.n_vars))

# min_cells=1 keeps every gene detected in at least one cell, i.e. it only
# removes genes that are zero everywhere; adata_raw is modified in place
sc.pp.filter_genes(adata_raw, min_cells=1, inplace=True)
print('Number of genes after cell filter: {:d}'.format(adata_raw.n_vars))

Total number of genes: 26593
filtered out 4272 genes that are detectedin less than 1 cells


Trying to set attribute `.var` of view, making a copy.


Number of genes after cell filter: 22321


In [15]:
#Keep the count data in a counts layer --- here we have TPM but we save it as counts to run SCVI 
# The copy is taken before the log transform in the next cell, so the layer keeps the untransformed values
adata_raw.layers["counts"] = adata_raw.X.copy()

In [16]:
# Log-transform adata.X with log1p (in place); no other normalization is applied in this notebook
sc.pp.log1p(adata_raw)

In [17]:
# Save final merged object
# NOTE(review): assumes the ./write directory of results_file already exists - confirm
adata_raw.write(results_file)

... storing 'batch' as categorical
... storing 'study' as categorical
... storing 'chemistry' as categorical
... storing 'tissue' as categorical
... storing 'species' as categorical
... storing 'data_type' as categorical
