In [1]:
import numpy as np
import scanpy.api as sc
from scipy.sparse import csr_matrix
import logging
import pandas as pd
import combat2
import combat
import patsy

##################
# Configure file #
##################
# Scanpy's own message verbosity (separate from the Python `logging` level below).
sc.settings.verbosity = 2
# NOTE(review): presumably stops scanpy plotting calls from showing figures
# automatically — confirm against the scanpy settings documentation.
sc.settings.autoshow = False
# Emit the logging.info(...) progress messages used throughout this notebook.
logging.basicConfig(level=logging.INFO)


In [2]:
# Destination of the processed AnnData (target of `adata.write(savepath, ...)`).
# NOTE(review): both paths are machine-specific absolute paths; adjust before
# running on another machine.
savepath = '/Users/yimmieg/Downloads/CLUESImmVar_processed.V6.h5ad'
# Input AnnData loaded by `sc.read(filepath)` in the basic-processing cell
# ('nonorm' in the file name presumably means un-normalised counts — confirm).
filepath = '/Users/yimmieg/Downloads/CLUESImmVar_nonorm.V6.h5ad'

In [3]:

####################
# Basic processing #
####################
# Load the input AnnData, make `well` categorical and de-duplicate gene names.
adata = sc.read(filepath)
adata.obs['well'] = adata.obs['well'].astype('category')
adata.var_names_make_unique()
logging.info('Data structure details: ' + str(adata))

# Erythrocyte removal: keep only the cells whose HBB value is at most 1.
logging.info('Removing Erythrocytes.')
is_hbb = adata.var_names.isin(['HBB'])
mat = csr_matrix(adata.X)[:, is_hbb].todense()
adata = adata[np.ravel(mat <= 1)]
logging.info('Data structure details: ' + str(adata))
# Gene names, in the column order of adata
genelist = adata.var_names.tolist()
# Mitochondrial genes are recognised by their 'MT-' prefix
mito_genes_names = list(filter(lambda gn: gn.startswith('MT-'), genelist))
logging.info('Mito genes: ' + str(mito_genes_names))
# Column positions of the mitochondrial genes
mito_genes = list(map(genelist.index, mito_genes_names))
# Per-cell fraction of counts that fall in mitochondrial genes
mito_counts = np.ravel(np.sum(adata[:, mito_genes].X, axis=1))
all_counts = np.ravel(np.sum(adata.X, axis=1))
adata.obs['percent_mito'] = mito_counts / all_counts
# Per-cell total counts, stored as an observation annotation
adata.obs['n_counts'] = np.ravel(adata.X.sum(axis=1))
# Clinical variates
diseasecovpath = 'v2.clinical.data.txt'
clinic_cov = pd.read_csv(diseasecovpath, sep="\t")
# Fix name to make it compatible with clinical variates.
# The relabelled column is built off to the side and assigned back whole:
# chained assignment (obs[col][mask] = value) may write to a temporary copy
# and silently leave adata.obs unchanged.
ind_cov = adata.obs['ind_cov'].astype('object').replace(
    {'1221': '1221_1221', '1251': '1251_1251', '1891': '1891_1891'})
adata.obs['ind_cov'] = ind_cov
indlist = np.unique(np.asarray(adata.obs['ind_cov'].tolist()))
# Add SLEDAI scores as covariate (stored as strings): '0' for individuals whose
# first cell is labelled 'healthy', otherwise the `sledaiscore` of the first
# clinical row whose `genotypeid` matches the individual.
logging.info('Add SLEDAI scores as a covariate.')
disease_cov = adata.obs['disease_cov']
sledai_by_ind = {}
for ind in indlist:
    is_ind = (ind_cov == ind).values
    # .iloc[0] is an explicit positional lookup; obs is indexed by cell name,
    # so a bare [0] would depend on pandas' label/position fallback.
    if disease_cov[is_ind].iloc[0] == 'healthy':
        sledai_by_ind[str(ind)] = '0'
    else:
        scores = clinic_cov.loc[clinic_cov['genotypeid'].isin([ind]), 'sledaiscore']
        if scores.empty:
            # Same exception type as the original empty-list lookup, with context.
            raise IndexError('No sledaiscore found in ' + diseasecovpath
                             + ' for individual ' + str(ind))
        sledai_by_ind[str(ind)] = str(scores.values.tolist()[0])
# Every individual in indlist has an entry, so every cell receives a value.
adata.obs['SLEDAI'] = ind_cov.map(sledai_by_ind)

logging.info('Add whether or not sequencing was performed at the Broad Institute as a covariate.')
# Get list of batches
batch_list = adata.obs['batch_cov'].tolist()
# 1.0 where the batch name contains 'immvar', 0.0 otherwise.
# Built directly as float32: np.zeros_like() on the list of batch names yields
# a *string* array, so the previous code stored '1'/'0' and relied on a later
# string-to-float conversion.
broad = np.array(['immvar' in batch_name for batch_name in batch_list], dtype=np.float32)
# Make obs for Broad indication
adata.obs['Broad'] = broad

logging.info('Add gender as a covariate.')
individuals_list = adata.obs['ind_cov'].tolist()
# Resolve the flag once per individual instead of once per cell: the clinical
# table lookup scans every row of clinic_cov, which is prohibitive when
# repeated for each of the several hundred thousand cells.
female_by_ind = {}
for ind in sorted(set(individuals_list)):
    if 'IGT' in ind:
        # Individuals whose id contains 'IGT' are flagged female without a lookup.
        female_by_ind[ind] = 1.0
    else:
        flags = clinic_cov.loc[clinic_cov['genotypeid'].isin([ind]), 'female']
        if flags.empty:
            # Same exception type as the original empty-list lookup, with context.
            raise IndexError('No clinical record with genotypeid ' + str(ind))
        # First matching clinical row decides, as before.
        female_by_ind[ind] = 1.0 if flags.values.tolist()[0] == 1 else 0.0
# Expand back to one float32 value per cell.
female = np.array([female_by_ind[ind] for ind in individuals_list], dtype=np.float32)
# Make obs for Female indication
adata.obs['Female'] = female

logging.info('Filtering cells')
# Filter cells that have more than 10% of counts coming from mitochondrial genes.
# .copy() materialises the subset so the in-place filters below operate on a
# real AnnData rather than on a view of the unfiltered object.
adata = adata[adata.obs['percent_mito'] < 0.10].copy()
logging.info('Data structure details: ' + str(adata))
# Filter cells with abnormally low gene counts, high gene counts.
sc.pp.filter_cells(adata, min_genes=100)
sc.pp.filter_cells(adata, max_genes=2500)
# Drop genes detected in fewer than 200 cells.
sc.pp.filter_genes(adata, min_cells=200)
logging.info('Data structure details: ' + str(adata))
logging.info('Saving raw counts')
adata.uns['barcodes'] = adata.obs_names.tolist()
adata.uns['genes'] = adata.var_names.tolist()
# Store an independent copy: the normalisation and log steps below act on
# adata.X, and a shared reference would let them alter the "raw" counts too.
adata.uns['raw_counts'] = adata.X.copy()
logging.info('Normalizing total counts to 10,000')
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
logging.info('Log transforming data')
sc.pp.log1p(adata)
logging.info('Saving log(counts)+1 in .raw')
# Snapshot of the normalised, log-transformed matrix, kept in .raw
adata.raw = adata



INFO:root:Data structure details: AnnData object with n_obs × n_vars = 834096 × 32738 
    obs: 'disease_cov', 'ct_cov', 'pop_cov', 'ind_cov', 'well', 'batch_cov', 'batch'
    var: 'gene_ids-0-0-0-0-0-0-0-0-0-0-0-0-0', 'gene_ids-1-0-0-0-0-0-0-0-0-0-0-0-0', 'gene_ids-1-0-0-0-0-0-0-0-0-0-0-0', 'gene_ids-1-0-0-0-0-0-0-0-0-0-0', 'gene_ids-1-0-0-0-0-0-0-0-0-0', 'gene_ids-1-0-0-0-0-0-0-0-0', 'gene_ids-1-0-0-0-0-0-0-0', 'gene_ids-1-0-0-0-0-0-0', 'gene_ids-1-0-0-0-0-0', 'gene_ids-1-0-0-0-0', 'gene_ids-1-0-0-0', 'gene_ids-1-0-0', 'gene_ids-1-0', 'gene_ids-1'
INFO:root:Removing Erythrocytes.
INFO:root:Data structure details: View of AnnData object with n_obs × n_vars = 832490 × 32738 
    obs: 'disease_cov', 'ct_cov', 'pop_cov', 'ind_cov', 'well', 'batch_cov', 'batch'
    var: 'gene_ids-0-0-0-0-0-0-0-0-0-0-0-0-0', 'gene_ids-1-0-0-0-0-0-0-0-0-0-0-0-0', 'gene_ids-1-0-0-0-0-0-0-0-0-0-0-0', 'gene_ids-1-0-0-0-0-0-0-0-0-0-0', 'gene_ids-1-0-0-0-0-0-0-0-0-0', 'gene_ids-1-0-0-0-0-0-0-0-0', 'gene_ids-1-0-

filtered out 1828 cells that have more than  2500 genes expressed
filtered out 14548 genes that are detected in less than 100 cells


INFO:root:Data structure details: AnnData object with n_obs × n_vars = 819878 × 18190 
    obs: 'disease_cov', 'ct_cov', 'pop_cov', 'ind_cov', 'well', 'batch_cov', 'batch', 'percent_mito', 'n_counts', 'SLEDAI', 'Broad', 'Female', 'n_genes'
    var: 'gene_ids-0-0-0-0-0-0-0-0-0-0-0-0-0', 'gene_ids-1-0-0-0-0-0-0-0-0-0-0-0-0', 'gene_ids-1-0-0-0-0-0-0-0-0-0-0-0', 'gene_ids-1-0-0-0-0-0-0-0-0-0-0', 'gene_ids-1-0-0-0-0-0-0-0-0-0', 'gene_ids-1-0-0-0-0-0-0-0-0', 'gene_ids-1-0-0-0-0-0-0-0', 'gene_ids-1-0-0-0-0-0-0', 'gene_ids-1-0-0-0-0-0', 'gene_ids-1-0-0-0-0', 'gene_ids-1-0-0-0', 'gene_ids-1-0-0', 'gene_ids-1-0', 'gene_ids-1', 'n_cells'
INFO:root:Saving raw counts
INFO:root:Normalizing total counts to 10,000
INFO:root:Log transforming data
INFO:root:Saving log(counts)+1 in .raw


In [4]:
# Sanity check: (n_obs, n_vars) after the filtering in the previous cell.
adata.shape

(819878, 18190)

In [None]:

logging.info('Running combat')
# Design matrix (intercept + disease status) and batch labels, both re-indexed
# 0..n-1 before being handed to combat.
mod = patsy.dmatrix("~ disease_cov", adata.obs, return_type="dataframe").reset_index(drop=True)
batch = adata.obs['batch_cov'].reset_index(drop=True)
# combat receives the densified, transposed matrix; transpose the result back.
corrected = combat.combat(adata.X.transpose().toarray(), batch=batch, model=mod)
adata.X = corrected.transpose()

# Add platelet genes for purposes of regressing out their signature
for platelet_gene in ("PF4", "SDPR", "GNG11", "PPBP"):
    adata.obs[platelet_gene] = adata.raw[:, platelet_gene].X

logging.info('Making .obs into categories')
adata.strings_to_categoricals()

logging.info('Filtering genes')
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
# Keep only the genes flagged as highly variable
hvg_mask = adata.var['highly_variable']
adata = adata[:, hvg_mask]
logging.info('Data structure details: ' + str(adata))

# Scale each gene, clipping values at 10
sc.pp.scale(adata, max_value=10)

INFO:root:Running combat
found 14 batches
found 0 numerical covariates...
found 1 categorical variables:	disease_cov[T.sle]
Standardizing Data across genes.


In [None]:
## compute PCA
sc.pp.pca(adata, random_state=1, svd_solver='arpack')

## collect every principal component whose absolute Pearson correlation with
## PF4 exceeds 0.1; these are added to .obs and regressed out below.
X_pca = adata.obsm['X_pca']
pf4 = adata.obs["PF4"]
platelet_pcs = []
for ii in range(X_pca.shape[1]):
    # Compute the correlation matrix once per component and reuse it.
    corr = np.corrcoef(X_pca[:, ii], pf4)
    logging.info(str(corr[0, 1]))
    if np.absolute(corr[0, 1]) > 0.1:
        pc_name = "PC" + str(ii)
        adata.obs[pc_name] = X_pca[:, ii]
        platelet_pcs.append(pc_name)
        logging.info(str(ii))
        logging.info(str(corr))
# Names are accumulated in a list (Series.append was removed in pandas 2.0);
# `to_remove` is still exposed as a Series.
to_remove = pd.Series(platelet_pcs, dtype=object)


##batch_cov = pd.get_dummies(adata.obs.batch_cov)
##adata.obs = pd.concat([adata.obs, batch_cov], axis=1)
regressors = ['n_counts', 'percent_mito', 'Female'] + to_remove.tolist()

logging.info('Regressing out total nUMIs, percentage mitochondrial UMIs, gender, platelet signature and pool')
logging.info('regressors: '+str(regressors))

sc.pp.regress_out(adata, regressors)

##sc.pp.regress_out(adata, ['batch_cov']);

logging.info('Scaling expression data')
# Re-scale after regression, clipping values at 10
sc.pp.scale(adata, max_value=10)


# ## combat adjustment                                                                                                                                                                                                           
# logging.info('Running combat')
# mod = patsy.dmatrix("~ disease_cov", adata.obs, return_type="dataframe")
# mod = mod.reset_index(drop=True)
# batch = adata.obs['batch_cov']
# batch = batch.reset_index(drop=True)
# adata.X = combat.combat(adata.X.transpose(), batch=batch, model=mod).transpose();

##combat2.combat(adata, model=mod, key="batch_cov")
##sc.pp.combat(adata, key="batch_cov")


logging.info('Data structure details: ' + str(adata))
logging.info('Removing samples that were processed twice...')
# Individuals processed twice whose lupus1.10 run is discarded
remove = ['902289200_902289200', '1262_1262', '1270_1270', '1279_1279']
# Individuals processed twice whose lupus8.9 run is discarded.
# The 8.9 run has the fewest cells, so this matters when testing for
# cell-proportion differences; going forward, batches of the same individual
# should be combined instead.
remove2 = ['1472_1472', '1479_1479', '1480_1480', '1492_1492', '1522_1522', '1535_1535', '1602_1602', '1615_1615', '1621_1621', '1716_1716', '1726_1726', '1730_1730']
dup_in_1_10 = adata.obs.ind_cov.isin(remove) & (adata.obs.batch_cov == "lupus1.10")
dup_in_8_9 = adata.obs.ind_cov.isin(remove2) & (adata.obs.batch_cov == "lupus8.9")
keep_indices = ~dup_in_1_10
keep2_indices = ~dup_in_8_9
adata = adata[keep_indices & keep2_indices]
logging.info('Data structure details: ' + str(adata))
# Unique individuals remaining after the removal
people = np.unique(adata.obs['ind_cov'].values.tolist())
# Number of cells per individual
total_pbmcs = {p: len(adata.obs_names[adata.obs['ind_cov'] == p]) for p in people}
adata.uns['total_pbmcs'] = total_pbmcs
logging.info('Saving processed data')
# Drop every .var column before writing; none of them is used afterwards.
for var_column in list(adata.var.keys()):
    del adata.var[var_column]
logging.info('Structure details: ' + str(adata))
adata.write(savepath, compression="gzip")


In [None]:
# Reload the processed AnnData from disk.
# NOTE(review): `processed_path` is relative to the working directory, whereas
# the processed file is written to the absolute `savepath` — confirm that both
# point at the same file before relying on this cell.
version = "V6"
name = "CLUESImmVar_processed."+version
processed_path = name+'.h5ad';
adata = sc.read(processed_path, cache=True)

In [None]:
#######################
# Louvain and friends #
#######################
# Set parameters
# `intialization` (sic) is the random seed passed to PCA, neighbors and UMAP below.
intialization = 1
# NOTE(review): `n_components` and `resolution` are defined here but not
# referenced anywhere in this cell — confirm whether PCA was meant to use
# `n_components`.
n_components = 20
resolution = 3
# Build the embedding: PCA -> neighbor graph -> UMAP. No clustering is run in
# this cell, despite the log message below.
logging.info('Estimating louvain cluster identities for gene expression values.')
sc.pp.pca(adata, random_state=intialization, svd_solver='arpack')
logging.info('PCA complete.')
sc.pp.neighbors(adata, random_state=intialization)
logging.info('KNN complete.')
sc.tl.umap(adata, random_state=intialization)
logging.info('UMAP complete.')
# Persist the object, now carrying the PCA / neighbors / UMAP results.
adata.write(savepath, compression="gzip")

In [None]:
# Sequencing site: 'Broad' for batches whose name contains "immvar", else 'UCSF'.
# The column is built in one step and assigned whole. The previous version
# seeded the categorical from disease_cov values (none of which are valid
# categories) and then filled it through chained attribute assignment, which
# may operate on a temporary copy.
is_broad = np.asarray(adata.obs['batch_cov'].str.contains("immvar"), dtype=bool)
adata.obs['site'] = pd.Categorical(np.where(is_broad, 'Broad', 'UCSF'),
                                   categories=['Broad', 'UCSF'])

# Combined disease / population / site label. Vectorised string concatenation
# replaces the row-wise apply over every cell. Combinations not listed in
# `categories` become NaN, as before.
disease_pop_site = (adata.obs['disease_cov'].astype(str)
                    + adata.obs['pop_cov'].astype(str)
                    + adata.obs['site'].astype(str))
adata.obs['disease_pop_site_cov'] = pd.Categorical(disease_pop_site,
                                             categories=['healthyWHITEBroad','healthyWHITEUCSF','sleWHITEUCSF','sleASIANUCSF'],
                                             ordered=True)

In [None]:
## replicates are present, so put the cells into one consistent order:
## descending by disease status, then individual, then batch
sort_keys = ['disease_cov', 'ind_cov', 'batch_cov']
adata_obs_sorted = adata.obs.sort_values(by=sort_keys, ascending=False)
adata = adata[adata_obs_sorted.index]

In [None]:
# UMAP of all cells, coloured by the combined disease / population / site label.
sc.pl.umap(adata, color="disease_pop_site_cov")

In [None]:
# Same UMAP restricted to cells whose disease_cov is "healthy".
sc.pl.umap(adata[(adata.obs.disease_cov == "healthy")],color="disease_pop_site_cov")

In [None]:
# Diffusion map, then Louvain and Leiden clustering (same seed and resolution).
sc.tl.diffmap(adata)
logging.info('diffmap complete.')
sc.tl.louvain(adata, random_state=30, resolution=3)
sc.tl.leiden(adata, random_state=30, resolution=3)
logging.info('Louvain complete.')
logging.info('Making .obs into categories')
adata.strings_to_categoricals()
# Write to the processed-data path, like every earlier save in this notebook.
# Writing to `filepath` would overwrite the un-normalised input file that the
# basic-processing cell reads, making the pipeline impossible to re-run.
adata.write(savepath, compression="gzip")
logging.info('Basic analysis complete.')