# 1. Let's import a bunch of stuff

In [1]:
import sys
import seaborn as sb
import scanpy.api as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import preprocessing
from sklearn.preprocessing import MaxAbsScaler
import argparse
import logging
import gc
from skbio.stats.composition import clr
%matplotlib inline
# Savefile path
savepath = 'V6.5.flareAb.h5ad'
#savepath = 'V6.flareAb_withDoublets.h5ad'
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 500)



# Import data file

In [2]:
df = pd.read_excel('flare.covars.again.xlsx')
df.head(300)

Unnamed: 0,RUN_WELL,ID,ALIAS,SOURCE,STATUS,TR/UNT,PAIR
0,1_1,0_904194200_904194200,"FLARE2,3,or8",UCSF,FLARE,"Unknown (2 TR, 1 UNT)",True
1,1_1,0_1716_1716,-,-,-,-,-
2,1_1,0_IGTB256,IGTB256,IMMVAR,HEALTHY,-,-
3,1_1,0_1892_1892,-,-,-,-,-
4,1_1,0_900805200_900805200,1804_1804,CLUES,HEALTHY,-,-
5,1_1,0_904236200_904236200,-,-,-,-,-
6,1_1,0_Criswell_Ye_E04_1941DNA01_Criswell_Ye_E04_19...,-,-,-,-,-
7,1_1,0_1754_1754,1754_1754,CLUES,HEALTHY,-,-
8,1_1,0_IGTB1921,IGTB1921,IMMVAR,HEALTHY,-,-
9,1_1,0_1763_1763,"FLARE2,3,or8",UCSF,FLARE,"Unknown (2 TR, 1 UNT)",True


# Combine flare cohorts

In [3]:
# Flare cohort files
files = [
'/ye/yelabstore2/george/Sasha_Flare_Studies/10xcount/combined.h5ads/again/1_1.adt.mrna.demuxed.freemuxed.h5ad',
'/ye/yelabstore2/george/Sasha_Flare_Studies/10xcount/combined.h5ads/again/1_2.adt.mrna.demuxed.freemuxed.h5ad',
'/ye/yelabstore2/george/Sasha_Flare_Studies/10xcount/combined.h5ads/again/2_1.adt.mrna.demuxed.freemuxed.h5ad',
'/ye/yelabstore2/george/Sasha_Flare_Studies/10xcount/combined.h5ads/again/2_2.adt.mrna.demuxed.freemuxed.h5ad',
'/ye/yelabstore2/george/Sasha_Flare_Studies/10xcount/combined.h5ads/again/3_1.adt.mrna.demuxed.freemuxed.h5ad',
'/ye/yelabstore2/george/Sasha_Flare_Studies/10xcount/combined.h5ads/again/3_2.adt.mrna.demuxed.freemuxed.h5ad',
'/ye/yelabstore2/george/Sasha_Flare_Studies/10xcount/combined.h5ads/again/3_3.adt.mrna.demuxed.freemuxed.h5ad',
'/ye/yelabstore2/george/Sasha_Flare_Studies/10xcount/combined.h5ads/again/3_4.adt.mrna.demuxed.freemuxed.h5ad',
'/ye/yelabstore2/george/Sasha_Flare_Studies/10xcount/combined.h5ads/again/4_1.adt.mrna.demuxed.freemuxed.h5ad',
'/ye/yelabstore2/george/Sasha_Flare_Studies/10xcount/combined.h5ads/again/4_2.adt.mrna.demuxed.freemuxed.h5ad',
'/ye/yelabstore2/george/Sasha_Flare_Studies/10xcount/combined.h5ads/again/4_3.adt.mrna.demuxed.freemuxed.h5ad',
'/ye/yelabstore2/george/Sasha_Flare_Studies/10xcount/combined.h5ads/again/4_4.adt.mrna.demuxed.freemuxed.h5ad']
flaregps = [
'1_1', '1_2', '2_1', '2_2',
'3_1', '3_2', '3_3', '3_4',
'4_1', '4_2', '4_3', '4_4']
DOUBLETS_dict = dict()
# Concatenate files
for ii in range(len(files)):
    df = pd.read_excel('flare.covars.again.xlsx')
    # This is crucial because Untreated and Treated were run in different runs and this is the only way to map it back.
    df = df[df['RUN_WELL'] == flaregps[ii]]
    if ii ==0:
        adata = sc.read(files[ii])
        print('Initial')
        print(adata)
        #---
        # Keep only singlets
        doubletlist2 = [adata.obs['BEST'][i].split('-')[0] for i in range(len(adata.obs['BEST']))]
        adata.obs['DOUBLETS'] = doubletlist2
        adata = adata[adata.obs['DOUBLETS']=='SNG']
        print('SNG confirmation via SNG- tag in BEST')
        
        print(adata)
        inds0 = np.unique(adata.obs['BEST'].tolist())
        inds =[]
        for i in range(len(inds0)):
            inds.append('-'.join(inds0[i].split('-')[1:]))
        inds = pd.Series(inds)
        inds0 = pd.Series(inds0)
        booleanlist = inds0[inds.isin(np.unique(df['ID'].tolist())).tolist()]
        print('Removing barcodes with no covariate data')
        adata = adata[adata.obs['BEST'].isin(booleanlist)]
        print(adata)
        
        inds0 = np.unique(adata.obs['BEST'].tolist())
        inds = []
        for i in range(len(inds0)):
            inds.append('-'.join(inds0[i].split('-')[1:]))
        
        # Preallocate space
        adata.obs['BEST'] = adata.obs['BEST'].astype('object')
        adata.obs['SOURCE'] = adata.obs['BEST']
        adata.obs['STATUS'] = adata.obs['BEST']
        adata.obs['TR_UNT'] = adata.obs['BEST']
        adata.obs['ind_cov'] = adata.obs['BEST']
        for xcv in range(len(inds)):
            adata.obs['SOURCE'][adata.obs['BEST']==inds0[xcv]] = df['SOURCE'][df['ID'].isin([inds[xcv]])].values[0]
            adata.obs['STATUS'][adata.obs['BEST']==inds0[xcv]] = df['STATUS'][df['ID'].isin([inds[xcv]])].values[0]
            adata.obs['TR_UNT'][adata.obs['BEST']==inds0[xcv]] = df['TR/UNT'][df['ID'].isin([inds[xcv]])].values[0]
            if df['ALIAS'][df['ID'].isin([inds[xcv]])].values[0] == 'FLARE2,3,or8':
                adata.obs['ind_cov'][adata.obs['BEST']==inds0[xcv]] = df['ID'][df['ID'].isin([inds[xcv]])].values[0].split(',')[0]
            else:
                adata.obs['ind_cov'][adata.obs['BEST']==inds0[xcv]] = df['ALIAS'][df['ID'].isin([inds[xcv]])].values[0]
           
        del adata.obs['N.SNP']
        del adata.obs['BEST']
        del adata.obs['mUMI']
        del adata.obs['aUMI']
        del adata.obs['NUM.SNPS']
        del adata.obs['DROPLET.TYPE']
        del adata.obs['BEST.GUESS']
        del adata.obs['DOUBLETS']
        #---
        print(str('Structure details: ' + str(adata)))
        adata.obs['well'] = np.repeat(flaregps[ii], len(adata.obs_names))
        adata.obs['batch_cov'] = np.repeat(flaregps[ii][:-2], len(adata.obs_names))
    else:
        bdata = sc.read(files[ii])
        print('Initial')
        print(bdata)
        #---
        # Keep only singlets
        doubletlist2 = [bdata.obs['BEST'][i].split('-')[0] for i in range(len(bdata.obs['BEST']))]
        bdata.obs['DOUBLETS'] = doubletlist2
        bdata = bdata[bdata.obs['DOUBLETS']=='SNG']
        print('SNG confirmation via SNG- tag in BEST')
        
        print(bdata)
        inds0 = np.unique(bdata.obs['BEST'].tolist())
        inds = []
        for i in range(len(inds0)):
            inds.append('-'.join(inds0[i].split('-')[1:]))
        inds = pd.Series(inds)
        inds0 = pd.Series(inds0)
        booleanlist = inds0[inds.isin(np.unique(df['ID'].tolist())).tolist()]
        print('Removing barcodes with no covariate data')
        bdata = bdata[bdata.obs['BEST'].isin(booleanlist)]
        print(bdata)
        
        inds0 = np.unique(bdata.obs['BEST'].tolist())
        inds = []
        for i in range(len(inds0)):
            inds.append('-'.join(inds0[i].split('-')[1:]))
        
        # Preallocate space
        bdata.obs['BEST'] = bdata.obs['BEST'].astype('object')
        bdata.obs['SOURCE'] = bdata.obs['BEST']
        bdata.obs['STATUS'] = bdata.obs['BEST']
        bdata.obs['TR_UNT'] = bdata.obs['BEST']
        bdata.obs['ind_cov'] = bdata.obs['BEST']
        for xcv in range(len(inds)):
            bdata.obs['SOURCE'][bdata.obs['BEST']==inds0[xcv]] = df['SOURCE'][df['ID'].isin([inds[xcv]])].values[0]
            bdata.obs['STATUS'][bdata.obs['BEST']==inds0[xcv]] = df['STATUS'][df['ID'].isin([inds[xcv]])].values[0]
            bdata.obs['TR_UNT'][bdata.obs['BEST']==inds0[xcv]] = df['TR/UNT'][df['ID'].isin([inds[xcv]])].values[0]
            if df['ALIAS'][df['ID'].isin([inds[xcv]])].values[0] == 'FLARE2,3,or8':
                bdata.obs['ind_cov'][bdata.obs['BEST']==inds0[xcv]] = df['ID'][df['ID'].isin([inds[xcv]])].values[0].split(',')[0]
            else:    
                bdata.obs['ind_cov'][bdata.obs['BEST']==inds0[xcv]] = df['ALIAS'][df['ID'].isin([inds[xcv]])].values[0]
                               
        del bdata.obs['N.SNP']
        del bdata.obs['BEST']
        del bdata.obs['mUMI']
        del bdata.obs['aUMI']
        del bdata.obs['NUM.SNPS']
        del bdata.obs['DROPLET.TYPE']
        del bdata.obs['BEST.GUESS']
        del bdata.obs['DOUBLETS']
        #---
        bdata.obs['well'] = np.repeat(flaregps[ii], len(bdata.obs_names))
        bdata.obs['batch_cov'] = np.repeat(flaregps[ii][:-2], len(bdata.obs_names))
        # Concatenate files.
        adata = adata.concatenate(bdata)
        print(str('Structure details: ' + str(adata)))

Only considering the two last: ['.freemuxed', '.h5ad'].
Only considering the two last: ['.freemuxed', '.h5ad'].
Initial
AnnData object with n_obs × n_vars = 12401 × 32758 
    obs: 'N.SNP', 'BEST', 'mUMI', 'aUMI', 'NUM.SNPS', 'DROPLET.TYPE', 'BEST.GUESS'
    var: 'gene_ids', 'protein?'
SNG confirmation via SNG- tag in BEST
View of AnnData object with n_obs × n_vars = 8948 × 32758 
    obs: 'N.SNP', 'BEST', 'mUMI', 'aUMI', 'NUM.SNPS', 'DROPLET.TYPE', 'BEST.GUESS', 'DOUBLETS'
    var: 'gene_ids', 'protein?'
Removing barcodes with no covariate data
View of AnnData object with n_obs × n_vars = 8948 × 32758 
    obs: 'N.SNP', 'BEST', 'mUMI', 'aUMI', 'NUM.SNPS', 'DROPLET.TYPE', 'BEST.GUESS', 'DOUBLETS'
    var: 'gene_ids', 'protein?'


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Structure details: AnnData object with n_obs × n_vars = 8948 × 32758 
    obs: 'SOURCE', 'STATUS', 'TR_UNT', 'ind_cov'
    var: 'gene_ids', 'protein?'
Only considering the two last: ['.freemuxed', '.h5ad'].
Only considering the two last: ['.freemuxed', '.h5ad'].
Initial
AnnData object with n_obs × n_vars = 12648 × 32758 
    obs: 'N.SNP', 'BEST', 'mUMI', 'aUMI', 'NUM.SNPS', 'DROPLET.TYPE', 'BEST.GUESS'
    var: 'gene_ids', 'protein?'
SNG confirmation via SNG- tag in BEST
View of AnnData object with n_obs × n_vars = 9458 × 32758 
    obs: 'N.SNP', 'BEST', 'mUMI', 'aUMI', 'NUM.SNPS', 'DROPLET.TYPE', 'BEST.GUESS', 'DOUBLETS'
    var: 'gene_ids', 'protein?'
Removing barcodes with no covariate data
View of AnnData object with n_obs × n_vars = 9458 × 32758 
    obs: 'N.SNP', 'BEST', 'mUMI', 'aUMI', 'NUM.SNPS', 'DROPLET.TYPE', 'BEST.GUESS', 'DOUBLETS'
    var: 'gene_ids', 'protein?'


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Structure details: AnnData object with n_obs × n_vars = 18406 × 32758 
    obs: 'SOURCE', 'STATUS', 'TR_UNT', 'batch', 'batch_cov', 'ind_cov', 'well'
    var: 'gene_ids-0', 'protein?-0', 'gene_ids-1', 'protein?-1'
Only considering the two last: ['.freemuxed', '.h5ad'].
Only considering the two last: ['.freemuxed', '.h5ad'].
Initial
AnnData object with n_obs × n_vars = 12635 × 32758 
    obs: 'N.SNP', 'BEST', 'mUMI', 'aUMI', 'NUM.SNPS', 'DROPLET.TYPE', 'BEST.GUESS'
    var: 'gene_ids', 'protein?'
SNG confirmation via SNG- tag in BEST
View of AnnData object with n_obs × n_vars = 7789 × 32758 
    obs: 'N.SNP', 'BEST', 'mUMI', 'aUMI', 'NUM.SNPS', 'DROPLET.TYPE', 'BEST.GUESS', 'DOUBLETS'
    var: 'gene_ids', 'protein?'
Removing barcodes with no covariate data
View of AnnData object with n_obs × n_vars = 7789 × 32758 
    obs: 'N.SNP', 'BEST', 'mUMI', 'aUMI', 'NUM.SNPS', 'DROPLET.TYPE', 'BEST.GUESS', 'DOUBLETS'
    var: 'gene_ids', 'protein?'
Structure details: AnnData object with n_obs × n

Initial
AnnData object with n_obs × n_vars = 23259 × 32758 
    obs: 'N.SNP', 'BEST', 'mUMI', 'aUMI', 'NUM.SNPS', 'DROPLET.TYPE', 'BEST.GUESS'
    var: 'gene_ids', 'protein?'
SNG confirmation via SNG- tag in BEST
View of AnnData object with n_obs × n_vars = 17449 × 32758 
    obs: 'N.SNP', 'BEST', 'mUMI', 'aUMI', 'NUM.SNPS', 'DROPLET.TYPE', 'BEST.GUESS', 'DOUBLETS'
    var: 'gene_ids', 'protein?'
Removing barcodes with no covariate data
View of AnnData object with n_obs × n_vars = 16725 × 32758 
    obs: 'N.SNP', 'BEST', 'mUMI', 'aUMI', 'NUM.SNPS', 'DROPLET.TYPE', 'BEST.GUESS', 'DOUBLETS'
    var: 'gene_ids', 'protein?'
Structure details: AnnData object with n_obs × n_vars = 110769 × 32758 
    obs: 'SOURCE', 'STATUS', 'TR_UNT', 'batch', 'batch_cov', 'ind_cov', 'well'
    var: 'gene_ids-0-0-0-0-0-0-0-0', 'protein?-0-0-0-0-0-0-0-0', 'gene_ids-1-0-0-0-0-0-0-0', 'protein?-1-0-0-0-0-0-0-0', 'gene_ids-1-0-0-0-0-0-0', 'protein?-1-0-0-0-0-0-0', 'gene_ids-1-0-0-0-0-0', 'protein?-1-0-0-0-0-0', 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


# House Keeping

In [4]:
# Fix this mislabeling.
adata.obs.ind_cov = adata.obs.ind_cov.astype('object')
adata.obs.ind_cov = adata.obs.ind_cov.astype('str')
# Mislabeled by wrong excel label.
adata.obs.ind_cov[adata.obs.ind_cov=='101'] = '1240'
adata.obs.ind_cov[adata.obs.ind_cov=='101_101'] = '1240'
adata.obs.ind_cov[adata.obs.ind_cov=='0_900805200_900805200'] = '1804_1804'
adata.obs.ind_cov[adata.obs.ind_cov=='0_904194200_904194200'] = '904194200_904194200'
adata.obs.ind_cov[adata.obs.ind_cov=='0_1586_1586'] = '1586_1586'
adata.obs.ind_cov[adata.obs.ind_cov=='0_1763_1763'] = '1763_1763'
# Change to healthy to be more descriptive.
adata.obs['TR_UNT'][adata.obs['TR_UNT']=='-'] = 'HEALTHY'
# Remove '-' person...
adata = adata[adata.obs['ind_cov']!='-']
# 1333 is unknown. Remove.
adata = adata[adata.obs['ind_cov']!='1333_1333']
# 1772 is unknown. Remove.
adata = adata[adata.obs['ind_cov']!='1772']
# 2132DNA01' is unknown. Remove.
adata = adata[adata.obs['ind_cov']!='2132DNA01']
# 0_902320200_902320200 does not exist. Remove.
adata = adata[adata.obs['ind_cov']!='0_902320200_902320200']
print('Initial ind labels')
ind_list = np.unique(adata.obs['ind_cov'].tolist())
print(ind_list)
# Simplify labels
for ii in range(len(adata.obs['ind_cov'])):
    if adata.obs['ind_cov'][ii].startswith('0_'):
        adata.obs['ind_cov'][ii] = '_'.join(adata.obs['ind_cov'][ii].split('_')[1:])
print('Simplified labels')        
ind_list = np.unique(adata.obs['ind_cov'].tolist())
print(ind_list)
# Remove individuals with barcode counts less than 100
for ii in range(len(ind_list)):
    if np.sum(adata.obs['ind_cov']==ind_list[ii]) < 100:
        adata = adata[adata.obs['ind_cov']!=ind_list[ii]]
    else:
        continue
ind_list = np.unique(adata.obs['ind_cov'].tolist())
print(ind_list)

## from runs 1 and 2
## these are labeled as healthy from CLUES but pretty sure they are treated CLUES samples
## and not true healthies
adata.obs['TR_UNT'] = adata.obs['TR_UNT'].astype('object')
adata.obs.loc[adata.obs.ind_cov == "1754_1754",'TR_UNT'] = "TR"
adata.obs.loc[adata.obs.ind_cov == "1771_1771",'TR_UNT'] = "TR"
adata.obs.loc[adata.obs.ind_cov == "1791_1791",'TR_UNT'] = "TR"
adata.obs.loc[adata.obs.ind_cov == "900805200_900805200",'TR_UNT'] = "TR"
adata.obs.loc[adata.obs.ind_cov == "1240",'TR_UNT'] = "TR"
adata.obs.loc[adata.obs.ind_cov == "1804_1804",'TR_UNT'] = "TR"

# Based on feature maps, we see near complete B cell depletion with two of three individuals in dataset. Rituximab treatment
# would result in an effect like this, therefore we can establish the treatment/untreated identity of each subject.
adata.obs['TR_UNT'] = adata.obs['TR_UNT'].astype('str')
# 904194200_904194200 #
adata.obs['TR_UNT'][(adata.obs['TR_UNT']=='Unknown (1 TR, 2 UNT)') & (adata.obs['ind_cov']=='904194200_904194200')] = 'UNT'
adata.obs['TR_UNT'][(adata.obs['TR_UNT']=='Unknown (2 TR, 1 UNT)') & (adata.obs['ind_cov']=='904194200_904194200')] = 'TR'

# 1586_1586 #
adata.obs['TR_UNT'][(adata.obs['TR_UNT']=='Unknown (1 TR, 2 UNT)') & (adata.obs['ind_cov']=='1586_1586')] = 'TR'
adata.obs['TR_UNT'][(adata.obs['TR_UNT']=='Unknown (2 TR, 1 UNT)') & (adata.obs['ind_cov']=='1586_1586')] = 'UNT'

# 1763_1763 # TR in Unknown (2 TR, 1 UNT)
adata.obs['TR_UNT'][(adata.obs['TR_UNT']=='Unknown (1 TR, 2 UNT)') & (adata.obs['ind_cov']=='1763_1763')] = 'UNT'
adata.obs['TR_UNT'][(adata.obs['TR_UNT']=='Unknown (2 TR, 1 UNT)') & (adata.obs['ind_cov']=='1763_1763')] = 'TR'
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A

Initial ind labels
['0_904464200_904464200' '1240' '1586_1586' '1754_1754' '1763_1763'
 '1771_1771' '1791_1791' '1804_1804' '904194200_904194200' 'FLARE004'
 'FLARE005' 'FLARE006' 'FLARE007' 'FLARE009' 'FLARE011' 'FLARE013'
 'FLARE014' 'FLARE015' 'FLARE016' 'FLARE017' 'FLARE020' 'HC-002' 'HC-014'
 'HC-020' 'HC-501' 'HC-502' 'HC-503' 'HC-504' 'HC-506' 'IGTB1290'
 'IGTB1372' 'IGTB1506' 'IGTB1921' 'IGTB195' 'IGTB256' 'IGTB469' 'IGTB508'
 'IGTB514' 'IGTB826']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Simplified labels
['1240' '1586_1586' '1754_1754' '1763_1763' '1771_1771' '1791_1791'
 '1804_1804' '904194200_904194200' '904464200_904464200' 'FLARE004'
 'FLARE005' 'FLARE006' 'FLARE007' 'FLARE009' 'FLARE011' 'FLARE013'
 'FLARE014' 'FLARE015' 'FLARE016' 'FLARE017' 'FLARE020' 'HC-002' 'HC-014'
 'HC-020' 'HC-501' 'HC-502' 'HC-503' 'HC-504' 'HC-506' 'IGTB1290'
 'IGTB1372' 'IGTB1506' 'IGTB1921' 'IGTB195' 'IGTB256' 'IGTB469' 'IGTB508'
 'IGTB514' 'IGTB826']
['1240' '1586_1586' '1754_1754' '1763_1763' '1771_1771' '1791_1791'
 '1804_1804' '904194200_904194200' 'FLARE004' 'FLARE005' 'FLARE006'
 'FLARE007' 'FLARE009' 'FLARE011' 'FLARE013' 'FLARE014' 'FLARE015'
 'FLARE016' 'FLARE017' 'FLARE020' 'HC-002' 'HC-014' 'HC-020' 'HC-501'
 'HC-502' 'HC-503' 'HC-504' 'HC-506' 'IGTB1290' 'IGTB1372' 'IGTB1506'
 'IGTB1921' 'IGTB195' 'IGTB256' 'IGTB469' 'IGTB508' 'IGTB514' 'IGTB826']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a

# Add female and broad covariates

In [5]:
# preallocate space
adata.obs['Female'] = adata.obs.ind_cov.astype('object')
adata.obs['pop_cov'] = adata.obs.ind_cov.astype('object')
genderdict = {'1240':1, '1716_1716':1, '1804_1804':1,'1754_1754':1, '1731_1731':1, '1763_1763':1, 
'1771_1771':1, '1791_1791':1, '1892_1892':1,'904236200_904236200':1, '1243_1243':1, 'FLARE004':1, 'FLARE005':1, 'FLARE006':1, 'FLARE007':1,
 'FLARE009':1, 'FLARE011':1, 'FLARE013':1, '904194200_904194200':1, '1586_1586':1, '1763_1763':1,'FLARE014':1, 'FLARE015':1,
 'FLARE016':1, 'FLARE017':1, 'FLARE020':1,'FLARE2,3,or8':1, 'HC-002':0, 'HC-014':1,
 'HC-020':1, 'HC-501':1, 'HC-502':1, 'HC-503':1, 'HC-504':1, 'HC-506':1,
 'IGTB1290':1, 'IGTB1372':1, 'IGTB1506':1, 'IGTB1921':1, 'IGTB195':1, 'IGTB256':1, 'IGTB469':1,
 'IGTB508':1, 'IGTB514':1, 'IGTB826':1}

popdict = {'1240':'ASIAN', '1716_1716':'WHITE', '1804_1804':'ASIAN','1754_1754':'ASIAN', '1731_1731':'WHITE', '1763_1763':'ASIAN', 
'1771_1771':'ASIAN', '904194200_904194200':'WHITE', '1586_1586':"ASIAN", '1763_1763':'ASIAN' ,'1791_1791':'ASIAN', '1892_1892':'WHITE', '904236200_904236200':'ASIAN', '1243_1243':'WHITE', 
           'FLARE004':'ASIAN', 'FLARE005':'WHITE', 'FLARE006':'WHITE', 'FLARE007':'ASIAN',
 'FLARE009':'HISPANIC', 'FLARE011':'ASIAN', 'FLARE013':'BLACK', 'FLARE014':'BLACK', 'FLARE015':'ASIAN',
 'FLARE016':'ASIAN', 'FLARE017':'BLACK', 'FLARE020':'WHITE','FLARE2,3,or8':'UNKNOWN', 'HC-002':'WHITE', 'HC-014':'ASIAN',
 'HC-020':'UNKNOWN', 'HC-501':'ASIAN', 'HC-502':'WHITE', 'HC-503':'WHITE', 'HC-504':'WHITE', 'HC-506':'WHITE',
 'IGTB1290':'WHITE', 'IGTB1372':'WHITE', 'IGTB1506':'WHITE', 'IGTB1921':'WHITE', 'IGTB195':'WHITE', 'IGTB256':'WHITE', 'IGTB469':'WHITE',
 'IGTB508':'WHITE', 'IGTB514':'WHITE', 'IGTB826':'WHITE'}

for key in genderdict.keys():
    adata.obs.Female[adata.obs.ind_cov==key] = genderdict[key]
    
for key in popdict.keys():
    adata.obs.pop_cov[adata.obs.ind_cov==key] = popdict[key]
    
# Preallocate space
adata.obs['Broad'] = adata.obs.SOURCE.astype('object')
# if immvar is source, assign 1.
adata.obs['Broad'][adata.obs.SOURCE == 'IMMVAR'] = 1
adata.obs['Broad'][adata.obs.SOURCE != 'IMMVAR'] = 0

adata.obs['Female'] = adata.obs['Female'].astype(dtype=np.float32)
adata.obs['Broad'] = adata.obs['Broad'].astype(dtype=np.float32)
adata.write(savepath, compression="gzip")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
... storing 'SOURCE' as categorical
... storing 'STATUS' as categorical
... storing 'TR_UNT' as categorical
... storing 'batch_cov' as categorical
... storing 'ind_cov' as categorical
... storing 'well' as categorical
... storing 'pop_cov' as c

# Fix a bug and move Ab data out of gene matrix and take Center-log-ratio

In [6]:
from skbio.stats.composition import clr
adata = sc.read(savepath)
# Fix gene endings to be compatible with CLUESIMMVAR DataFrame
genelist = adata.var_names.tolist()
genelist = [gn[:-2] for gn in genelist]
adata.var_names = genelist
# Find antibodies
ab_names = [ab for ab in genelist if 'pAbO' in ab]
# Find antibody index
ab_index = [genelist.index(ab) for ab in ab_names]
# Make new adata structure with just the antibody results
ABdata = adata[:, ab_index]
mat = csr_matrix(ABdata.X)
mat = mat.todense()
'''
print('Add 1')
matrix = np.array(mat+1)
print('Center-Log-Ratio')
matrix = clr(matrix)
'''
matrix = mat
print(str('abmatrix shape: ' + str(np.shape(matrix))))
print(str('abmatrix details: ' + str(matrix)))
# Make antibody into observations
for ii in range(len(ab_names)):
    LIST = np.asarray(matrix[:,ii])
    adata.obs[ab_names[ii]] = LIST
    
# Remove antibodies from gene list
abbool = [not ab for ab in adata.var_names.isin(ab_names)]
adata = adata[:, abbool]
print(str('Structure details: ' + str(adata)))

Only considering the two last: ['.flareAb', '.h5ad'].
Only considering the two last: ['.flareAb', '.h5ad'].
abmatrix shape: (147987, 20)
abmatrix details: [[1.97e+02 2.34e+02 5.40e+01 ... 6.80e+01 5.10e+01 9.23e+02]
 [3.14e+02 2.28e+02 5.10e+01 ... 6.90e+01 5.70e+01 8.30e+02]
 [3.80e+02 3.77e+02 8.70e+01 ... 8.60e+01 7.20e+01 2.09e+03]
 ...
 [1.55e+02 1.00e+00 8.00e+01 ... 4.10e+01 4.80e+01 2.70e+02]
 [1.09e+02 3.00e+00 6.80e+01 ... 5.10e+01 5.10e+01 3.93e+02]
 [3.63e+02 6.00e+00 4.80e+01 ... 3.80e+01 4.10e+01 4.35e+02]]
Structure details: View of AnnData object with n_obs × n_vars = 147987 × 32738 
    obs: 'SOURCE', 'STATUS', 'TR_UNT', 'batch', 'batch_cov', 'ind_cov', 'well', 'Female', 'pop_cov', 'Broad', 'CD38|CD38|j95-01|pAbO', 'CD95|FAS|j95-02|pAbO', 'CD28|CD28|j95-07|pAbO', 'CD27|CD27|j95-08|pAbO', 'CD197|CCR7|j95-09|pAbO', 'CD19|CD19|j95-10|pAbO', 'CD161|KLRB1|j95-11|pAbO', 'CD183|CXCR3|j95-12|pAbO', 'CD4|CD4|j95-14|pAbO', 'CD3|CD3E|j95-16|pAbO', 'CD196|CCR6|j95-17|pAbO', 'HLA-D

# Remove all var lists to clean up structure

In [7]:
for key in list(adata.var.keys()):
    del adata.var[key]
adata.obs['disease_cov'] = adata.obs['TR_UNT']
del adata.obs['batch']
del adata.obs['STATUS']
adata.obs

Unnamed: 0_level_0,SOURCE,TR_UNT,batch_cov,ind_cov,well,Female,pop_cov,Broad,CD38|CD38|j95-01|pAbO,CD95|FAS|j95-02|pAbO,CD28|CD28|j95-07|pAbO,CD27|CD27|j95-08|pAbO,CD197|CCR7|j95-09|pAbO,CD19|CD19|j95-10|pAbO,CD161|KLRB1|j95-11|pAbO,CD183|CXCR3|j95-12|pAbO,CD4|CD4|j95-14|pAbO,CD3|CD3E|j95-16|pAbO,CD196|CCR6|j95-17|pAbO,HLA-DR|CD74|j95-18|pAbO,CD45RO|PTPRC|j95-19|pAbO,CD14|CD14|j95-20|pAbO,CD194|CCR4|j95-21|pAbO,CD185|CXCR5|j95-22|pAbO,CD25|IL2RA|j95-24|pAbO,CD8|CD8A|j95-25|pAbO,CD127|IL7R|j95-27|pAbO,CD45RA|PTPRC|j95-28|pAbO,disease_cov
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
AAACCTGAGCAATCTC-1-0-0-0-0-0-0-0-0-0-0-0,IMMVAR,HEALTHY,1,IGTB256,1_1,1.0,WHITE,1.0,197.0,234.0,54.0,52.0,83.0,9.0,76.0,117.0,71.0,84.0,87.0,446.0,50.0,328.0,49.0,19.0,23.0,68.0,51.0,923.0,HEALTHY
AAACCTGAGCTACCTA-1-0-0-0-0-0-0-0-0-0-0-0,CLUES,TR,1,1804_1804,1_1,1.0,ASIAN,0.0,314.0,228.0,51.0,73.0,106.0,281.0,46.0,101.0,109.0,112.0,424.0,7203.0,56.0,207.0,50.0,183.0,21.0,69.0,57.0,830.0,TR
AAACCTGAGTATCGAA-1-0-0-0-0-0-0-0-0-0-0-0,UCSF,TR,1,1240,1_1,1.0,ASIAN,0.0,380.0,377.0,87.0,129.0,168.0,571.0,78.0,152.0,223.0,229.0,342.0,15863.0,84.0,319.0,73.0,148.0,43.0,86.0,72.0,2090.0,TR
AAACCTGAGTCTCGGC-1-0-0-0-0-0-0-0-0-0-0-0,IMMVAR,HEALTHY,1,IGTB1921,1_1,1.0,WHITE,1.0,846.0,457.0,99.0,139.0,138.0,43.0,99.0,191.0,822.0,232.0,161.0,13221.0,254.0,2758.0,78.0,28.0,44.0,80.0,84.0,440.0,HEALTHY
AAACCTGCAAGTCATC-1-0-0-0-0-0-0-0-0-0-0-0,CLUES,TR,1,1804_1804,1_1,1.0,ASIAN,0.0,1237.0,3050.0,768.0,2372.0,957.0,75.0,362.0,1145.0,4912.0,2036.0,1366.0,3627.0,363.0,3596.0,649.0,51.0,250.0,684.0,590.0,3658.0,TR
AAACCTGCAGCTTAAC-1-0-0-0-0-0-0-0-0-0-0-0,UCSF,TR,1,904194200_904194200,1_1,1.0,WHITE,0.0,425.0,772.0,251.0,2098.0,172.0,14.0,60.0,469.0,218.0,3142.0,241.0,502.0,79.0,256.0,67.0,22.0,38.0,1296.0,125.0,4513.0,TR
AAACCTGCATAACCTG-1-0-0-0-0-0-0-0-0-0-0-0,IMMVAR,HEALTHY,1,IGTB256,1_1,1.0,WHITE,1.0,842.0,576.0,135.0,173.0,199.0,43.0,109.0,186.0,1321.0,280.0,172.0,27313.0,115.0,1800.0,125.0,34.0,42.0,114.0,124.0,773.0,HEALTHY
AAACCTGCATCGTCGG-1-0-0-0-0-0-0-0-0-0-0-0,CLUES,TR,1,1754_1754,1_1,1.0,ASIAN,0.0,215.0,301.0,138.0,974.0,172.0,9.0,48.0,135.0,139.0,1009.0,144.0,369.0,50.0,263.0,48.0,18.0,21.0,260.0,137.0,2481.0,TR
AAACCTGCATTGCGGC-1-0-0-0-0-0-0-0-0-0-0-0,IMMVAR,HEALTHY,1,IGTB256,1_1,1.0,WHITE,1.0,556.0,305.0,63.0,81.0,131.0,466.0,59.0,110.0,134.0,108.0,602.0,5871.0,59.0,264.0,36.0,164.0,26.0,58.0,57.0,2228.0,HEALTHY
AAACCTGGTAAATGTG-1-0-0-0-0-0-0-0-0-0-0-0,IMMVAR,HEALTHY,1,IGTB256,1_1,1.0,WHITE,1.0,186.0,291.0,203.0,1734.0,152.0,11.0,60.0,100.0,2453.0,1710.0,108.0,376.0,56.0,193.0,29.0,14.0,32.0,52.0,131.0,1295.0,HEALTHY


# Process flare cohort

In [8]:
import numpy as np
import scanpy.api as sc
from scipy.sparse import csr_matrix
import logging
import doubletdetection
import pandas as pd
import combat2
import combat
import patsy

##################
# Configure file #
##################
sc.settings.verbosity = 2
sc.settings.autoshow = False
logging.basicConfig(level=logging.INFO)
adata.obs['well'] = adata.obs['well'].astype('category')
adata.var_names_make_unique()
logging.info(str('Data structure details: ' + str(adata)))
mat = csr_matrix(adata.X)
logging.info(str('Data structure details: ' + str(adata)))
# Extract list of genes
genelist = adata.var_names.tolist()
# Find mitochondrial genes
mito_genes_names = [gn for gn in genelist if gn.startswith('MT-')]
logging.info(str('Mito genes: ' + str(mito_genes_names)))
# Find indices of mitochondrial genes
mito_genes = [genelist.index(gn) for gn in mito_genes_names]
# For each cell compute fraction of counts in mito genes vs. all genes
adata.obs['percent_mito'] = np.ravel(np.sum(adata[:, mito_genes].X, axis=1)) / np.ravel(np.sum(adata.X, axis=1))
# Add the total counts per cell as observations-annotation to adata
adata.obs['n_counts'] = np.ravel(adata.X.sum(axis=1))
# Fix name to make it compatible with clinical variates
adata.obs['ind_cov'] = adata.obs['ind_cov'].astype('object')
adata.obs['ind_cov'][adata.obs['ind_cov'].isin(['1221'])] = '1221_1221'
adata.obs['ind_cov'][adata.obs['ind_cov'].isin(['1251'])] = '1251_1251'
adata.obs['ind_cov'][adata.obs['ind_cov'].isin(['1891'])] = '1891_1891'
indlist = np.unique(np.asarray(adata.obs['ind_cov'].tolist()))
# Get list of batches
batch_list = adata.obs['batch_cov'].tolist()
logging.info('Filtering cells')
# Filter cells that have more than 10% of counts coming from mitochondrial genes.
adata = adata[adata.obs['percent_mito'] < 0.10]
logging.info(str('Data structure details: ' + str(adata)))
# Filter cells with abnormally low gene counts, high gene counts.
sc.pp.filter_cells(adata, min_genes=100)
sc.pp.filter_cells(adata, max_genes=2500)
sc.pp.filter_genes(adata, min_cells=100)
logging.info(str('Data structure details: ' + str(adata)))
logging.info('Saving raw counts')
adata.uns['barcodes'] = adata.obs_names.tolist()
adata.uns['genes'] = adata.var_names.tolist()
adata.uns['raw_counts'] = adata.X
logging.info('Normalizing total counts to 10,000')
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
logging.info('Log transforming data')
sc.pp.log1p(adata)
logging.info('Saving log(counts)+1 in .raw')
adata.raw = adata

# Add platelet genes for purposes of regressing out their signature
adata.obs['PF4'] = adata.raw[:,"PF4"].X
adata.obs['SDPR'] = adata.raw[:,"SDPR"].X
adata.obs['GNG11'] = adata.raw[:,"GNG11"].X
adata.obs['PPBP'] = adata.raw[:,"PPBP"].X
logging.info('Making .obs into categories')
adata.strings_to_categoricals()
logging.info('Filtering genes')
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5, inplace=True)
adata = adata[:, adata.var['highly_variable']]
logging.info(str('Data structure details: ' + str(adata)))
sc.pp.scale(adata, max_value=10)

## compute PCA
sc.pp.pca(adata, random_state=1, svd_solver='arpack')

## removing the two components most highly correlated with platelet counts
to_remove = pd.Series()
for ii in range(len(adata.obsm.X_pca[1,:])) :
    logging.info(str(np.corrcoef(adata.obsm.X_pca[:,ii], adata.obs["PF4"])[0,1]))
    if(np.absolute(np.corrcoef(adata.obsm.X_pca[:,ii], adata.obs["PF4"])[0,1]) > 0.1):
        adata.obs["PC"+str(ii)] = adata.obsm.X_pca[:,ii];
        to_remove = to_remove.append(pd.Series(["PC"+str(ii)]))
        logging.info(str(ii))
        logging.info(str(np.corrcoef(adata.obsm.X_pca[:,ii], adata.obs["PF4"])))

regressors = ['n_counts', 'percent_mito', 'Female']+to_remove.tolist();

logging.info('Regressing out total nUMIs, percentage mitochondrial UMIs, gender, platelet signature and pool')
logging.info('regressors: '+str(regressors))

sc.pp.regress_out(adata, regressors)

logging.info('Scaling expression data')
sc.pp.scale(adata, max_value=10)


## combat adjustment                                                                                                                                                                                                           
logging.info('Running combat')
mod = patsy.dmatrix("~ disease_cov", adata.obs, return_type="dataframe")
mod = mod.reset_index(drop=True)
batch = adata.obs['batch_cov']
batch = batch.reset_index(drop=True)
adata.X = combat.combat(adata.X.transpose(), batch=batch, model=mod).transpose();

logging.info(str('Data structure details: ' + str(adata)))
logging.info('Removing samples that were processed twice...')
# Remove samples that were processed twice and get rid of their 1.10 run
remove = ['902289200_902289200', '1262_1262', '1270_1270', '1279_1279']
# Remove samples that were processed twice and get rid of their 8.9 run
# since the 8.9 run has the fewest number of cells, this adjustment is important for testing for cell proportion differences
# going forward, we will want to combine batches for the same individuals
remove2 = ['1472_1472', '1479_1479', '1480_1480', '1492_1492', '1522_1522', '1535_1535', '1602_1602', '1615_1615', '1621_1621', '1716_1716', '1726_1726', '1730_1730']
keep_indices = ~((adata.obs.ind_cov.isin(remove)) & (adata.obs.batch_cov == "lupus1.10"))
keep2_indices = ~((adata.obs.ind_cov.isin(remove2)) & (adata.obs.batch_cov == "lupus8.9"))
adata = adata[(keep_indices & keep2_indices)]
logging.info(str('Data structure details: ' + str(adata)))
# Unique list of individuals
people = np.unique(adata.obs['ind_cov'].values.tolist())
# Allocate space for total PMBCs per individual.
total_pbmcs = dict.fromkeys(people)
for p in people:
    total_pbmcs[p] = len(adata.obs_names[adata.obs['ind_cov'] == p])
adata.uns['total_pbmcs'] = total_pbmcs
logging.info('Saving processed data')
# Remove .var information we never use.
for key in list(adata.var.keys()):
    del adata.var[key]
logging.info(str('Structure details: ' + str(adata)))
adata.write(savepath, compression="gzip")

INFO:root:Data structure details: AnnData object with n_obs × n_vars = 147987 × 32738 
    obs: 'SOURCE', 'TR_UNT', 'batch_cov', 'ind_cov', 'well', 'Female', 'pop_cov', 'Broad', 'CD38|CD38|j95-01|pAbO', 'CD95|FAS|j95-02|pAbO', 'CD28|CD28|j95-07|pAbO', 'CD27|CD27|j95-08|pAbO', 'CD197|CCR7|j95-09|pAbO', 'CD19|CD19|j95-10|pAbO', 'CD161|KLRB1|j95-11|pAbO', 'CD183|CXCR3|j95-12|pAbO', 'CD4|CD4|j95-14|pAbO', 'CD3|CD3E|j95-16|pAbO', 'CD196|CCR6|j95-17|pAbO', 'HLA-DR|CD74|j95-18|pAbO', 'CD45RO|PTPRC|j95-19|pAbO', 'CD14|CD14|j95-20|pAbO', 'CD194|CCR4|j95-21|pAbO', 'CD185|CXCR5|j95-22|pAbO', 'CD25|IL2RA|j95-24|pAbO', 'CD8|CD8A|j95-25|pAbO', 'CD127|IL7R|j95-27|pAbO', 'CD45RA|PTPRC|j95-28|pAbO', 'disease_cov'
INFO:root:Data structure details: AnnData object with n_obs × n_vars = 147987 × 32738 
    obs: 'SOURCE', 'TR_UNT', 'batch_cov', 'ind_cov', 'well', 'Female', 'pop_cov', 'Broad', 'CD38|CD38|j95-01|pAbO', 'CD95|FAS|j95-02|pAbO', 'CD28|CD28|j95-07|pAbO', 'CD27|CD27|j95-08|pAbO', 'CD197|CCR7|j95-0

filtered out 802 cells that have more than  2500 genes expressed
filtered out 18018 genes that are detected in less than 100 cells


INFO:root:Data structure details: AnnData object with n_obs × n_vars = 145158 × 14720 
    obs: 'SOURCE', 'TR_UNT', 'batch_cov', 'ind_cov', 'well', 'Female', 'pop_cov', 'Broad', 'CD38|CD38|j95-01|pAbO', 'CD95|FAS|j95-02|pAbO', 'CD28|CD28|j95-07|pAbO', 'CD27|CD27|j95-08|pAbO', 'CD197|CCR7|j95-09|pAbO', 'CD19|CD19|j95-10|pAbO', 'CD161|KLRB1|j95-11|pAbO', 'CD183|CXCR3|j95-12|pAbO', 'CD4|CD4|j95-14|pAbO', 'CD3|CD3E|j95-16|pAbO', 'CD196|CCR6|j95-17|pAbO', 'HLA-DR|CD74|j95-18|pAbO', 'CD45RO|PTPRC|j95-19|pAbO', 'CD14|CD14|j95-20|pAbO', 'CD194|CCR4|j95-21|pAbO', 'CD185|CXCR5|j95-22|pAbO', 'CD25|IL2RA|j95-24|pAbO', 'CD8|CD8A|j95-25|pAbO', 'CD127|IL7R|j95-27|pAbO', 'CD45RA|PTPRC|j95-28|pAbO', 'disease_cov', 'percent_mito', 'n_counts', 'n_genes'
    var: 'n_cells'
INFO:root:Saving raw counts
INFO:root:Normalizing total counts to 10,000
INFO:root:Log transforming data
INFO:root:Saving log(counts)+1 in .raw
INFO:root:Making .obs into categories
... storing 'ind_cov' as categorical
INFO:root:Filteri

computing PCA on highly variable genes


INFO:root:0.015616517051490964
INFO:root:-0.020953365160505253
INFO:root:0.0352976257426313
INFO:root:0.6059113886605495
INFO:root:3
INFO:root:[[1.         0.60591139]
 [0.60591139 1.        ]]
INFO:root:-0.02594449990327925
INFO:root:-0.07766348812266483
INFO:root:-0.02932749150047325
INFO:root:-0.027212744450640928
INFO:root:-0.04218847421671319
INFO:root:-0.01258502087888143
INFO:root:-0.016249119394006303
INFO:root:-0.01768577061988765
INFO:root:-0.04150431817404665
INFO:root:-0.016228595960448015
INFO:root:0.013389179407830302
INFO:root:-0.008563623468781912
INFO:root:-0.234156020417521
INFO:root:16
INFO:root:[[ 1.         -0.23415602]
 [-0.23415602  1.        ]]
INFO:root:-0.0524290734585791
INFO:root:0.036827273744469285
INFO:root:0.021002946160266864
INFO:root:0.012060138098899375
INFO:root:0.014257425489404705
INFO:root:-0.02055868503192482
INFO:root:-0.03070486792723581
INFO:root:-0.0047434647022468164
INFO:root:-0.008532411904352104
INFO:root:-0.004023593917164884
INFO:root:

regressing out ['n_counts', 'percent_mito', 'Female', 'PC3', 'PC16']


INFO:root:Scaling expression data


    finished (0:03:32.20)


INFO:root:Running combat
found 4 batches
found 0 numerical covariates...
found 2 categorical variables:	disease_cov[T.TR], disease_cov[T.UNT]
Standardizing Data across genes.
Fitting L/S model and finding priors
Finding parametric adjustments


Adjusting data


INFO:root:Data structure details: AnnData object with n_obs × n_vars = 145158 × 1161 
    obs: 'SOURCE', 'TR_UNT', 'batch_cov', 'ind_cov', 'well', 'Female', 'pop_cov', 'Broad', 'CD38|CD38|j95-01|pAbO', 'CD95|FAS|j95-02|pAbO', 'CD28|CD28|j95-07|pAbO', 'CD27|CD27|j95-08|pAbO', 'CD197|CCR7|j95-09|pAbO', 'CD19|CD19|j95-10|pAbO', 'CD161|KLRB1|j95-11|pAbO', 'CD183|CXCR3|j95-12|pAbO', 'CD4|CD4|j95-14|pAbO', 'CD3|CD3E|j95-16|pAbO', 'CD196|CCR6|j95-17|pAbO', 'HLA-DR|CD74|j95-18|pAbO', 'CD45RO|PTPRC|j95-19|pAbO', 'CD14|CD14|j95-20|pAbO', 'CD194|CCR4|j95-21|pAbO', 'CD185|CXCR5|j95-22|pAbO', 'CD25|IL2RA|j95-24|pAbO', 'CD8|CD8A|j95-25|pAbO', 'CD127|IL7R|j95-27|pAbO', 'CD45RA|PTPRC|j95-28|pAbO', 'disease_cov', 'percent_mito', 'n_counts', 'n_genes', 'PF4', 'SDPR', 'GNG11', 'PPBP', 'PC3', 'PC16'
    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'barcodes', 'genes', 'raw_counts', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'
INFO:root:Removing samples that were proces

# Run basic analysis

In [9]:
#######################
# Louvain and friends #
#######################
# Set parameters
intialization = 1
n_components = 20
resolution = 3
# Run louvain clustering on theoretical future gene expression per cell
logging.info('Estimating louvain cluster identities for gene expression values.')
sc.pp.pca(adata, random_state=intialization, svd_solver='arpack')
logging.info('PCA complete.')
sc.pp.neighbors(adata, random_state=intialization)
logging.info('KNN complete.')
sc.tl.diffmap(adata)
logging.info('diffmap complete.')
sc.tl.louvain(adata, random_state=15, resolution=resolution)
sc.tl.leiden(adata, random_state=intialization, resolution=resolution)
logging.info('Louvain complete.')
sc.tl.umap(adata, random_state=intialization)
logging.info('UMAP complete.')
logging.info('Making .obs into categories')
adata.strings_to_categoricals()
adata.write(savepath, compression="gzip")
logging.info('Basic analysis complete.')

INFO:root:Estimating louvain cluster identities for gene expression values.
INFO:root:PCA complete.


computing neighbors
    using 'X_pca' with n_pcs = 50


INFO:root:KNN complete.


    finished (0:01:09.73)
computing Diffusion Maps using n_comps=15(=n_dcs)


INFO:root:diffmap complete.


    eigenvalues of transition matrix
    [1.         0.9990519  0.99892676 0.9973141  0.99324816 0.99195755
     0.9907578  0.98999166 0.9893931  0.9848406  0.97949946 0.97781396
     0.97334594 0.96778125 0.9666325 ]
    finished (0:00:12.64)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished (0:00:47.56)
running Leiden clustering
    finished (0:04:14.07)


INFO:root:Louvain complete.


computing UMAP


INFO:root:UMAP complete.
INFO:root:Making .obs into categories


    finished (0:03:39.47)


INFO:root:Basic analysis complete.


# Quality Control - antibody abundance normalization

In [10]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
adata = sc.read(savepath)
print(adata)
unique_ind = np.unique(adata.obs['ind_cov'].tolist())
print(unique_ind)
for ii in range(len(unique_ind)):
    try:
        plt.figure()
        print('Individual:')
        print(unique_ind[ii])
        bdata = adata[adata.obs.ind_cov==unique_ind[ii]]
        print('Number of cells:')
        print(bdata.n_obs)
        sns.regplot(x=bdata.X[:,bdata.var_names.isin(['CD14'])], y=bdata.obs['CD14|CD14|j95-20|pAbO'])
        #plt.plot([-2, 5], [-2, 5], color='k', linestyle='-', linewidth=2)
        plt.axis('equal')
        plt.show()
    except:
        continue

Only considering the two last: ['.flareAb', '.h5ad'].
Only considering the two last: ['.flareAb', '.h5ad'].
AnnData object with n_obs × n_vars = 145158 × 1161 
    obs: 'SOURCE', 'TR_UNT', 'batch_cov', 'ind_cov', 'well', 'Female', 'pop_cov', 'Broad', 'CD38|CD38|j95-01|pAbO', 'CD95|FAS|j95-02|pAbO', 'CD28|CD28|j95-07|pAbO', 'CD27|CD27|j95-08|pAbO', 'CD197|CCR7|j95-09|pAbO', 'CD19|CD19|j95-10|pAbO', 'CD161|KLRB1|j95-11|pAbO', 'CD183|CXCR3|j95-12|pAbO', 'CD4|CD4|j95-14|pAbO', 'CD3|CD3E|j95-16|pAbO', 'CD196|CCR6|j95-17|pAbO', 'HLA-DR|CD74|j95-18|pAbO', 'CD45RO|PTPRC|j95-19|pAbO', 'CD14|CD14|j95-20|pAbO', 'CD194|CCR4|j95-21|pAbO', 'CD185|CXCR5|j95-22|pAbO', 'CD25|IL2RA|j95-24|pAbO', 'CD8|CD8A|j95-25|pAbO', 'CD127|IL7R|j95-27|pAbO', 'CD45RA|PTPRC|j95-28|pAbO', 'disease_cov', 'percent_mito', 'n_counts', 'n_genes', 'PF4', 'SDPR', 'GNG11', 'PPBP', 'PC3', 'PC16', 'louvain', 'leiden'
    uns: 'barcodes', 'diffmap_evals', 'genes', 'leiden', 'louvain', 'neighbors', 'pca', 'raw_counts', 'total_pbmcs



Individual:
HC-002
Number of cells:
3985
Individual:
HC-014
Number of cells:
2921
Individual:
HC-020
Number of cells:
3734
Individual:
HC-501
Number of cells:
2755
Individual:
HC-502
Number of cells:
8034
Individual:
HC-503
Number of cells:
3043
Individual:
HC-504
Number of cells:
3253
Individual:
HC-506
Number of cells:
3232
Individual:
IGTB1290
Number of cells:
1542
Individual:
IGTB1372
Number of cells:
3671
Individual:
IGTB1506
Number of cells:
2140
Individual:
IGTB1921
Number of cells:
2101
Individual:
IGTB195
Number of cells:
3807
Individual:
IGTB256
Number of cells:
2095
Individual:
IGTB469
Number of cells:
4343
Individual:
IGTB508
Number of cells:
2932
Individual:
IGTB514
Number of cells:
3470
Individual:
IGTB826
Number of cells:
2792


In [11]:
sc.pl.umap(adata,color=['CD14', 'CD14|CD14|j95-20|pAbO'])

