# Load libraries

In [None]:
import sys
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
from scipy.sparse import csr_matrix
import logging
import os
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import scipy.stats as stats
import statsmodels.formula.api as smf
import statsmodels.api as sm
import math
import gc
import random
from sklearn.utils import shuffle
import combat
import patsy
logging.basicConfig(level=logging.INFO)
savepath = 'Lupus_study_adjusted_V3.h5ad'

# Read data

In [None]:
%matplotlib inline
##################
# Configure file #
##################
sc.settings.verbosity = 2
sc.settings.autoshow = False
sc.settings.set_figure_params(dpi=100, dpi_save=300, format='png', frameon=False, transparent=True, fontsize=16)
plt.rcParams["image.aspect"] = "equal"
adata = sc.read(savepath, cache=True)
print(adata)
figdir = "./figures."
sc.settings.figdir = "./figures."
MASTERCOLORS = sc.pl.palettes.default_64
MASTERCOLORS.remove("#FEFFE6")
MASTERCOLORS.remove("#FFFF00")

# Projection

In [None]:
# UMAP coloured by disease group and by SLE status.
disease_umap = sc.pl.umap(
    adata,
    color=['disease_cov', 'SLE status'],
    size=1,
    show=True,
    edgecolor="none",
    palette=sc.pl.palettes.vega_20_scanpy,
    save='.disease.png',
)

In [None]:
# Estimate the UMAP embedding density per disease group, then plot it.
sc.tl.embedding_density(adata, groupby='disease_cov', basis='umap')
sc.pl.embedding_density(
    adata,
    basis='umap',
    key='umap_density_disease_cov',
    show=True,
    save='umap_density_disease_cov.png',
)

In [None]:
# Same density computation and plot as the preceding cell
# (it writes to the same output file name).
sc.tl.embedding_density(adata, groupby='disease_cov', basis='umap')
sc.pl.embedding_density(
    adata,
    basis='umap',
    key='umap_density_disease_cov',
    show=True,
    save='umap_density_disease_cov.png',
)

In [None]:
# UMAP coloured by batch (note: the result is stored in `disease_umap` again).
disease_umap = sc.pl.umap(
    adata,
    color='batch_cov',
    size=3,
    show=True,
    edgecolor="none",
    palette=sc.pl.palettes.vega_20_scanpy,
    save='.batch.png',
)

## Plot individual variability

In [None]:
# Plot one UMAP panel per individual of the first batch (at most 16, on a 4x4 grid).
colors = ["#8ED1C6","#FCF6B5", "#BEBAD9", "#F47F72", "#81B1D3", "#FBB463", "#B4D66C", "#F9CEE1", "#DAD9D9", "#BC80B7", "#CDE6C4", "#FEEC6E", "#E31F26", "#387EB9", "#4EAF49", "#984F9F", "#8ED1C6","#FCF6B5", "#BEBAD9"];
n_rows, n_cols = 4, 4
fig,ax = plt.subplots(nrows=n_rows,ncols=n_cols, figsize=(7,7),sharex=True, sharey=True)
fig.tight_layout()
plt.subplots_adjust(wspace=-0.1, hspace=0)

batch_name = adata.obs.batch_cov.unique()[0]
in_batch = adata.obs.batch_cov == batch_name
# np.asarray keeps only the individuals actually observed in this batch;
# `.categories` of a categorical result may also list individuals from other
# batches, which would produce empty panels.
batch = np.sort(np.asarray(adata.obs.ind_cov_disease_cov[in_batch].unique()))
n_panels = min(len(batch), n_rows * n_cols)

for ind_i in range(n_rows * n_cols):
    # Fill the grid column by column, top to bottom. The original used
    # (ind_i-1)%4 for the row, which put the first individual of every column
    # in the bottom row.
    panel = ax[ind_i % n_rows, ind_i // n_rows]
    if ind_i >= n_panels:
        # Fewer individuals than panels: blank the unused axes instead of
        # failing with an IndexError.
        panel.axis("off")
        continue
    ind = batch[ind_i]
    col = colors[ind_i]
    sc.pl.umap(adata[adata.obs.ind_cov_disease_cov==ind,], color = "ind_cov_disease_cov", title=None, palette=[col,col], ax=panel, size=20, edgecolor="none")
    panel.get_xaxis().set_visible(False)
    panel.get_yaxis().set_visible(False)
    legend = panel.get_legend()
    if legend is not None:
        legend.remove()
    panel.set_title("")
    panel.set_aspect("equal")

fig.savefig(figdir+'/'+batch_name+'.ind.png')

# Rank genes and plot leiden groups

In [None]:
# Rank genes for every Leiden cluster with a t-test, then plot the top 25 per cluster.
sc.tl.rank_genes_groups(adata, groupby='leiden', method='t-test')
sc.pl.rank_genes_groups(
    adata,
    n_genes=25,
    sharey=False,
)

In [None]:
# UMAP coloured by Leiden cluster, using the notebook's MASTERCOLORS palette.
sc.pl.umap(
    adata,
    color='leiden',
    size=3,
    show=True,
    edgecolor="none",
    save='.leiden.png',
    palette=MASTERCOLORS,
)

# Most expressed genes per leiden group

In [None]:
pd.set_option('display.max_columns', None)
unique_leiden = np.unique(adata.obs['leiden'].values)
# Compile list of top genes: one column per cluster, genes ordered from the
# highest to the lowest mean expression in that cluster.
# NOTE(review): clusters are addressed as "0".."n-1"; this assumes the Leiden
# labels are consecutive integers stored as strings — confirm.
ranked_genes = {}
for ii in range(len(unique_leiden)):
    in_cluster = (adata.obs['leiden'] == str(ii)).values
    # A sparse X returns a 1 x n_genes matrix here; flatten it so argsort and
    # the reversal work on a 1-D vector. With the original 2-D result
    # np.flipud was a no-op and the ranking came out ascending.
    mean_expr = np.asarray(adata.X[in_cluster].mean(axis=0)).ravel()
    ranked_genes['leiden_' + str(ii)] = adata.var_names[np.argsort(mean_expr)[::-1]]
GeneRanks = pd.DataFrame(ranked_genes)
GeneRanks.to_csv('Flare_study_top_expression.csv')
GeneRanks.head(20)

In [None]:
# Leiden cluster id -> annotated cell type.
# (Cluster 13 carried the remark "Macro" in the original annotation notes.)
LEIDEN_TO_CELLTYPE = {
    "0": "Classical Monocytes",
    "1": "Naive T Cells",
    "2": "Cytotoxic T Cells",
    "3": "B Cells",
    "4": "Effector Memory T Cells",
    "5": "Natural Killer Cells",
    "6": "Naive T Cells",
    "7": "Nonclassical Monocytes",
    "8": "Classical Dendritic Cells",
    "9": "Naive T Cells",
    "10": "Megakaryocytes",
    "11": "Plasmacytoid Dendritic Cells",
    "12": "Nonclassical Monocytes",
    "13": "Proliferating T Cells",
    "14": "Naive T Cells",
    "15": "RBCs",
    "16": "B Cells",
    "17": "Classical Monocytes",
    "18": "Progenitor Cells",
}

# Assign every label in one step. The original used chained assignment
# (obs['ct_cov'].loc[mask] = ...), which writes to a temporary object and is
# not guaranteed to reach adata.obs.
previous_labels = adata.obs['ct_cov'].astype('object')
mapped_labels = adata.obs['leiden'].astype(str).map(LEIDEN_TO_CELLTYPE)
# Cells whose cluster id is not in the mapping keep their previous label.
adata.obs['ct_cov'] = mapped_labels.where(mapped_labels.notna(), previous_labels).astype('category')

sc.settings.set_figure_params(dpi=100, dpi_save=300, format='png', frameon=False, transparent=True, fontsize=16)
plt.rcParams["image.aspect"] = "equal"
MasterORDER = ['Classical Monocytes','Nonclassical Monocytes', 'Classical Dendritic Cells', 'Plasmacytoid Dendritic Cells', 'Naive T Cells', 'Effector Memory T Cells', 'Cytotoxic T Cells', 'Proliferating T Cells', 'Natural Killer Cells', 'B Cells', 'Progenitor Cells', 'Megakaryocytes', 'RBCs']
#colorrs = ["#E58606","#5D69B1","#52BCA3","#99C945","#CC61B0","#24796C","#DAA51B","#2F8AC4","#764E9F","#ED645A","#CC3A8E",'#BC23FF', '#D790FF']
colorrs = ["#4E79A7","#A0CBE8","#F28E2B","#FFBE7D","#8CD17D","#B6992D","#499894","#E15759","#FF9D9A","#79706E","#D37295","#FABFD2", '#000000',"#B07AA1","#D4A6C8","#9D7660",
                 "#E58606", "#5D69B1", "#24796C", '#DAA51B', '#000000', '#99C945', '#ED645A']

# reorder_categories raises unless the observed categories are exactly MasterORDER.
adata.obs['ct_cov'] = adata.obs['ct_cov'].cat.reorder_categories(MasterORDER)
adata.uns['ct_cov_colors'] = colorrs
celltype_umap = sc.pl.umap(adata, color='ct_cov', show=True, size=3, edgecolor="none")

# Example antibodies

In [None]:
# UMAP coloured by four antibody-derived columns (CD45RA, CD45RO, CD4, CD8).
sc.pl.umap(
    adata,
    color=[
        'CD45RA|PTPRC|j95-28|pAbO',
        'CD45RO|PTPRC|j95-19|pAbO',
        'CD4|CD4|j95-14|pAbO',
        'CD8|CD8A|j95-25|pAbO',
    ],
    size=10,
)

# Label cell populations

# Proportions and Statistics

In [None]:
## Make proportion plots
# Count cells per (individual, cell type, disease, population, donor) combination.
# NOTE(review): if the grouping columns are categorical, pandas may emit every
# combination of categories (with zero counts) — confirm this is intended.
adata_obs_small = adata.obs
ind_count = adata_obs_small.groupby(['ind_cov_disease_cov','ct_cov','disease_cov','pop_cov', 'ind_cov'])['ct_cov'].count()

# Total cells per individual, aligned row-for-row with ind_count.
ind_totals = ind_count.groupby(level=0).transform('sum')
ind_count_sums = ind_count.groupby(level=0).sum().reset_index(name="counts")

# Percentage of each individual's cells that falls in every combination.
ind_perc = (ind_count / ind_totals * 100).reset_index(name="ct_perc")
# Add weights to WLS
ind_perc['counts'] = ind_count.values
# Add total sums per individual to structure. The original filled this column
# with chained assignment inside a per-individual loop; the transform above
# gives the same values in a single aligned step.
ind_perc['ind_count_sum'] = ind_totals.values


# Keep only the part of the sample id before the first underscore.
ind_perc.ind_cov_disease_cov = ind_perc.ind_cov_disease_cov.apply(lambda x: x.split('_')[0])
ind_count_sums.ind_cov_disease_cov = ind_count_sums.ind_cov_disease_cov.apply(lambda x: x.split('_')[0])
#ind_perc = ind_perc.set_index('ct_cov').join(cg_cov.set_index('ct_cov'))
ind_perc.ind_cov_disease_cov = ind_perc.ind_cov_disease_cov.astype("str")
ind_perc.reset_index(inplace=True)
ind_perc.ct_cov = ind_perc.ct_cov.astype('category')
ind_perc.ct_cov = ind_perc.ct_cov.cat.reorder_categories(adata.obs.ct_cov.cat.categories.values)

# One violin panel per cell type, with per-sample points and per-donor lines on top.
disease_order = ["Healthy", "Managed", "Treated", "Flare"]
perc_plot = sns.catplot(x='disease_cov', y='ct_perc', order=disease_order, hue='ct_cov', data=ind_perc, kind='violin', col_order=MasterORDER, col='ct_cov', col_wrap=3, cut=0, dodge=False, aspect=1, sharex=False, sharey=False, palette=colorrs)

for ct_i, ct in enumerate(MasterORDER):
    panel = perc_plot.axes[ct_i]
    ct_rows = ind_perc[ind_perc.ct_cov == ct]
    sns.swarmplot(x="disease_cov", y="ct_perc", data=ct_rows, order=disease_order, color="0", size=4, ax=panel)
    try:
        sns.pointplot(x="disease_cov", y="ct_perc", hue="ind_cov", data=ct_rows, order=disease_order, color="0", scale=0.3, ax=panel)
    except Exception as err:
        # The donor lines are best-effort, but a failure is reported and no
        # longer skips the axis clean-up below (the original bare
        # `except: continue` silently did).
        logging.warning("pointplot failed for %s: %s", ct, err)
    panel.get_yaxis().label.set_visible(False)
    panel.get_xaxis().label.set_visible(False)
    legend = panel.get_legend()
    if legend is not None:
        legend.remove()

# Grid-wide settings do not depend on the panel, so apply them once.
perc_plot.set_xticklabels(rotation=90)
perc_plot.fig.subplots_adjust(wspace=2, hspace = 1)

perc_plot.savefig(figdir+"/violin.ct_cov_figure6b.png")
perc_plot.savefig(figdir+"/violin.ct_cov_figure6b.pdf")

# Statistical tests...
def _wls_by_celltype(table, keep_labels, one_label, title):
    """Fit one weighted least-squares model per cell type and display the results.

    For every category of ``table.ct_cov`` the model
    ``ct_perc ~ const + indicator`` is fitted with ``ind_count_sum`` as
    weights. ``indicator`` is 1 where ``disease_cov == one_label`` and 0 for
    every other retained row.

    Parameters
    ----------
    table : DataFrame with columns ``disease_cov``, ``ct_cov`` (categorical),
        ``ct_perc`` and ``ind_count_sum``.
    keep_labels : disease labels whose rows take part in the comparison.
    one_label : disease label coded as 1.
    title : text printed before the results.

    Returns
    -------
    (all_out, effects, params) : the displayed result table, a dict mapping
        cell type to the indicator coefficient, and a dict mapping cell type
        to the fitted parameter Series (const first, indicator second).
    """
    print(title)
    subset = table[table.disease_cov.isin(keep_labels)]
    effects = {}
    params = {}
    rows = []
    for ct in subset.ct_cov.cat.categories:
        is_ct = subset.ct_cov == ct
        ct_perc = subset.ct_perc[is_ct].astype(float)
        weights = subset.ind_count_sum[is_ct]
        # Build the 0/1 regressor directly rather than overwriting strings in
        # place through `.values`.
        indicator = (subset.disease_cov[is_ct].astype("str") == one_label).astype(float)
        design = sm.add_constant(indicator)
        fit = sm.WLS(ct_perc, design, weights=weights).fit()
        # Positional access: the parameters are labelled by column name.
        effect = fit.params.iloc[1]
        pval = fit.pvalues.iloc[1]
        effects[ct] = effect
        params[ct] = fit.params
        # Beta stays a string, as in the original table.
        rows.append({"Cell": str(ct), "Beta": str(effect), "Pval": float(pval)})
    # DataFrame.append was removed in pandas 2.0; build the table in one go.
    all_out = pd.DataFrame(rows, columns=["Cell", "Beta", "Pval"])
    display(all_out)
    return all_out, effects, params


all_out, HEALTHYMANAGED_effect, _ = _wls_by_celltype(
    ind_perc, ['Healthy', 'Managed'], 'Healthy',
    'as a proportion of total PBMC: Healthy vs. Managed WEIGHTED BY TOTAL PBMC COUNTS')

all_out, HEALTHYTREATED_effect, _ = _wls_by_celltype(
    ind_perc, ['Healthy', 'Treated'], 'Healthy',
    'as a proportion of total PBMC: Healthy vs. Treated WEIGHTED BY TOTAL PBMC COUNTS')

all_out, HEALTHYFLARE_effect, _healthy_flare_params = _wls_by_celltype(
    ind_perc, ['Healthy', 'Flare'], 'Healthy',
    'as a proportion of total PBMC: Healthy vs. Flare WEIGHTED BY TOTAL PBMC COUNTS')
# log2 of (const + beta) / const for each cell type.
HEALTHYFLARE_FC = {ct: math.log2(sum(p) / p.iloc[0]) for ct, p in _healthy_flare_params.items()}

all_out, TREATEDFLARE_effect, _ = _wls_by_celltype(
    ind_perc, ['Treated', 'Flare'], 'Treated',
    'as a proportion of total PBMC: Treated vs. Flare WEIGHTED BY TOTAL PBMC COUNTS')

all_out, TREATEDMANAGED_effect, _ = _wls_by_celltype(
    ind_perc, ['Treated', 'Managed'], 'Treated',
    'as a proportion of total PBMC: Treated vs. Managed WEIGHTED BY TOTAL PBMC COUNTS')

# The original selected 'Untreated' here, a label that is not used anywhere
# else in this notebook (the groups are Healthy / Managed / Treated / Flare),
# so Flare samples were dropped from both pooled comparisons. 'Flare' is
# added; 'Untreated' is kept in case older data still carries that label.
all_out, _, _ = _wls_by_celltype(
    ind_perc, ['Healthy', 'Treated', 'Untreated', 'Flare', 'Managed'], 'Healthy',
    'as a proportion of total PBMC: Healthy vs. SLE WEIGHTED BY TOTAL PBMC COUNTS')

all_out, _, _ = _wls_by_celltype(
    ind_perc, ['Healthy', 'Treated', 'Untreated', 'Flare'], 'Healthy',
    'as a proportion of total PBMC: Healthy vs. Treated&Flare WEIGHTED BY TOTAL PBMC COUNTS')


In [None]:
def getproportions(celltype, group1, group2, table=None):
    """Print the mean ``ct_perc`` of one cell type for two disease groups.

    Parameters
    ----------
    celltype : value of ``ct_cov`` to select.
    group1, group2 : values of ``disease_cov`` to report, in that order.
    table : DataFrame with columns ``disease_cov``, ``ct_cov`` and
        ``ct_perc``. Defaults to the notebook-level ``ind_perc``.

    Prints one line per group. The mean is ``nan`` when no row matches.
    """
    if table is None:
        table = ind_perc
    for group in (group1, group2):
        # One combined mask: chained boolean indexing applies the second mask
        # to an already-filtered frame and triggers a reindexing warning.
        selected = table[(table.disease_cov == group) & (table.ct_cov == celltype)]
        print('{} proportion for celltype: {}: {}' .format(group, celltype, selected.ct_perc.mean()))

    
# NOTE(review): 'CD8+ T Cells' is not one of the labels in MasterORDER, which
# ct_cov was reordered to earlier in this notebook; if no rows match, both
# means print as nan — confirm the intended cell-type label.
getproportions(celltype='CD8+ T Cells',group1='Healthy', group2='Flare')

# Feature plots for Untreated Treated pairs

In [None]:
%%capture
# For each listed donor, plot that donor's cells on the UMAP coloured by disease state.
pairs = ['FLARE004', 'FLARE009', 'FLARE011', 'FLARE013', 'FLARE016', 'FLARE003', 'FLARE002', 'FLARE008']
colors = ["#000000", "#FF0000"]
for pair in pairs:
    bdata = adata[adata.obs['ind_cov']==pair]
    print(pair)
    sc.pl.umap(
        bdata,
        size=10,
        show=True,
        color='disease_cov',
        save=str('TREATED_UNTREATED_PAIR'+ pair + '.disease.png'),
        palette=colors,
    )
    plt.show()

# Map colors onto cell type labels

In [None]:
# Pair each cell-type category with the colour stored at the same position
# in adata.uns['ct_cov_colors'].
unique_ct = adata.obs.ct_cov.cat.categories.tolist()
colormapping = {
    ct_name: adata.uns['ct_cov_colors'][position]
    for position, ct_name in enumerate(unique_ct)
}

# Feature plots for marker genes

In [None]:
# Feature plots for the atypical memory B cell marker panel.
sc.pl.umap(
    adata,
    color=['CR2', 'CD27', 'FCRL5', 'ITGAX', 'NKG7'],
    show=False,
    save='.AtypicalBgenes.png',
)

In [None]:
## Feature plots for the platelet-effect gene panel.
sc.pl.umap(
    adata,
    color=["PF4", "SDPR", "GNG11", "PPBP"],
    size=3,
    show=False,
    save='.platelet.png',
)

In [None]:
## Feature plots for the cycling gene panel.
sc.pl.umap(
    adata,
    color=["KIAA0101","STMN1","TK1","MKI67"],
    size=3,
    show=False,
    save='.cycling.png',
)

In [None]:
## Feature plots for the macrophage gene panel.
sc.pl.umap(
    adata,
    color=['CD163', 'HLA-DRB1', 'C1QA', 'IFITM3'],
    size=3,
    show=False,
    save=".MACRO.png",
)

In [None]:
# Feature plots for CD7, PDCD1 and CD28 (shown inline, not saved).
sc.pl.umap(
    adata,
    color=["CD7", 'PDCD1', 'CD28'],
    size=3,
    show=True,
)

In [None]:
## Feature plots along the CD4 / CD8 / NK axis.
sc.pl.umap(
    adata,
    color=["CD3D","CD8A","CD4","NCAM1", "FCGR3A", 'NKG7', 'IFNG', 'GZMB', 'PRF1'],
    size=3,
    show=True,
    save=".TNK.png",
)
# Two NK sub populations: CD56high/lowCD16high/low, CD56lowCD16high
# https://www.frontiersin.org/files/Articles/162361/fimmu-06-00567-HTML/image_m/fimmu-06-00567-g001.jpg

In [None]:
#sc.pl.violin(adata, keys= ['GZMB', 'PRF1', 'SH2D1A', 'ICAM1','CD86', 'RAB27A', 'UNC13D', 'FHL3', 'FHL2'], groupby='disease_cov', show=False, use_raw=False, save='.T8emcyto1expression')
#sc.pl.stacked_violin(adata, keys= ['GZMB', 'PRF1', 'SH2D1A', 'ICAM1','CD86', 'RAB27A', 'UNC13D', 'FHL3', 'FHL2'], groupby='disease_cov', show=False, use_raw=False, save='.T8emcyto1expression')
# UMAP feature plots of the gene panel, read from adata.raw.
sc.pl.umap(
    adata,
    color=['GZMB', 'PRF1', 'SH2D1A', 'ICAM1','CD86', 'RAB27A', 'UNC13D', 'FHL3', 'FHL2'],
    size=3,
    show=False,
    use_raw=True,
    save=".gzmbgenes.png",
)
# Violin plots of the same panel, grouped by disease state.
sc.pl.violin(
    adata,
    keys=['GZMB', 'PRF1', 'SH2D1A', 'ICAM1','CD86', 'RAB27A', 'UNC13D', 'FHL3', 'FHL2'],
    groupby='disease_cov',
    show=False,
    use_raw=True,
    save='.T8emcyto1expression_raw',
)
#sc.pl.stacked_violin(adata, var_names= ['GZMB', 'PRF1', 'SH2D1A', 'ICAM1','CD86', 'RAB27A', 'UNC13D', 'FHL3', 'FHL2'], groupby='disease_cov', show=False, use_raw=True, save='.T8emcyto1expression_raw')


In [None]:
## Feature plots for the memory vs. helper vs. naive gene panel.
sc.pl.umap(
    adata,
    color=["CCR7","IL7R","S100A4","CD58", 'FAS', 'IL2RA'],
    show=False,
    size=3,
    save=".TMemThTNaive.png",
)

In [None]:
## Feature plots for the Treg gene panel.
sc.pl.umap(
    adata,
    color=["FOXP3","TNFRSF4","ENTPD1","CCR10"],
    size=3,
    show=False,
    save=".TREG.png",
)

In [None]:
## Feature plots for the gene panel used to check the Y-chromosome (sex) effect.
sc.pl.umap(
    adata,
    color=["DDX3Y", "RPS4Y1", "FHIT","TRAT1"],
    size =3,
    show=False,
    save=".Y.png",
)

In [None]:
## Feature plots for the B cell / plasmablast / pDC gene panel.
sc.pl.umap(
    adata,
    color=['BTLA', 'P2RY8',"MZB1", "CD19", "CD79A", 'MS4A1', 'FCRL5', 'IL6', 'CR2'],
    size=3,
    show=False,
    save=".B.png",
)


In [None]:
# Feature plots for ITGAM, CD33, FUT4, HLA-DRA, CD14 and CD163.
# The original saved this figure as ".B.png", the same name as the B cell
# panel plotted just before it, so that figure was overwritten.
# NOTE(review): ".Myeloid.png" is a suggested name — adjust if another is preferred.
sc.pl.umap(adata, color=['ITGAM', 'CD33', 'FUT4', 'HLA-DRA', 'CD14', 'CD163'],size=3, show=False, save=".Myeloid.png")

# Cytotoxic signature: Isolate cytotoxic CD3+ cells

In [None]:
# Keep only the cells annotated as 'Cytotoxic T Cells'.
bdata = adata[
    adata.obs['ct_cov'].isin(['Cytotoxic T Cells'])
]

In [None]:
# Gene feature plots and Leiden clusters within the cytotoxic subset.
sc.pl.umap(
    bdata,
    color=['CCR7', 'GZMB', 'PRF1', 'CD4', 'CD8A', 'leiden'],
    size=1,
)

In [None]:
# CD4 and CD8 antibody columns within the cytotoxic subset.
sc.pl.umap(
    bdata,
    color=['CD4|CD4|j95-14|pAbO', 'CD8|CD8A|j95-25|pAbO'],
    size=20,
)

In [None]:
# CD4 vs. CD8 antibody signal for every cell of the cytotoxic subset.
X = np.asarray(bdata.obs['CD4|CD4|j95-14|pAbO'].tolist())
Y = np.asarray(bdata.obs['CD8|CD8A|j95-25|pAbO'].tolist())
f, ax = plt.subplots(figsize=(7, 7))
# seaborn >= 0.12 no longer accepts x and y positionally, so pass them by keyword.
ax = sns.scatterplot(x=X, y=Y, ax=ax)
plt.xlabel('CD4')
plt.ylabel('CD8A')
joint_kws=dict(gridsize=50)
sns.jointplot(x=X, y=Y, kind="hex", color="b", joint_kws= joint_kws);

In [None]:
# Gate each antibody at 10% of its maximum (NaNs are ignored when taking the
# maximum). Both comparisons are strict, so a value exactly on the threshold
# falls into no quadrant.
cd4_gate = np.nanmax(X)*.10
cd8_gate = np.nanmax(Y)*.10
cd4_high, cd4_low = X > cd4_gate, X < cd4_gate
cd8_high, cd8_low = Y > cd8_gate, Y < cd8_gate
print(np.sum(cd4_high & cd8_low)) # CD4 only
print(np.sum(cd4_low & cd8_high)) # CD8 only
print(np.sum(cd4_low & cd8_low)) # Neither
print(np.sum(cd4_high & cd8_high)) # CD4+CD8+

## Reference populations were found by gating on CD4 and CD8 antibody expression, and then high-resolution clustering was performed. A Spearman correlation (r) was computed for each Leiden cluster to assign population identity.

In [None]:
# Load the separately annotated cytotoxic T cell object and report how many
# cells carry each of the two labels.
cdata = sc.read('Lupus_study_cytoTCell.h5ad', cache=True)
print(cdata)
cyto_labels = cdata.obs['ct_cov'].values
n_cd4_cyto = np.sum(cyto_labels=='CD4+ Cytotoxic T Cells')
n_cd8_cyto = np.sum(cyto_labels=='CD8+ Cytotoxic T Cells')
print('Number of CD4+ cytotoxic cells: {}'.format(n_cd4_cyto))
print('Number of CD8+ cytotoxic cells: {}'.format(n_cd8_cyto))

In [None]:
# Gene feature plots and cell-type labels on the cytotoxic T cell object.
sc.pl.umap(
    cdata,
    color=['CCR7', 'GZMB', 'PRF1', 'CD4', 'CD8A', 'ct_cov'],
    size=5,
)

In [None]:
def plot_gene_expression(cdata, celltypes, gene):
    """Plot per-sample mean expression of one gene for each cell type.

    For every (cell type, sample) pair the mean of ``cdata.raw.X`` over the
    matching cells is computed for ``gene``. The result is drawn as one
    violin panel per cell type, with per-sample points and per-sample lines
    on top.

    Parameters
    ----------
    cdata : AnnData-like object with ``.raw`` and with ``ct_cov``,
        ``ind_cov_disease_cov`` and ``disease_cov`` columns in ``.obs``.
    celltypes : sequence of ``ct_cov`` labels, one panel each.
    gene : name looked up in ``cdata.raw.var_names``.

    Returns
    -------
    DataFrame with columns ``Unique_ID``, ``disease_cov``, ``<gene>``,
    ``Cell_Count`` and ``ct_cov`` (categorical).

    Raises
    ------
    KeyError
        If ``gene`` is not present in ``cdata.raw.var_names``.
    """
    disease_order = ["Healthy", "Managed", "Treated", "Flare"]
    obs = cdata.obs

    gene_cols = np.flatnonzero(np.asarray(cdata.raw.var_names == gene))
    if gene_cols.size == 0:
        raise KeyError('{} not found in cdata.raw.var_names'.format(gene))
    # Extract the gene column once as a dense 1-D vector. Indexing raw.X with
    # two boolean masks at once only worked when raw.X was a scipy sparse matrix.
    expr = cdata.raw.X[:, gene_cols[0]]
    if hasattr(expr, "toarray"):
        expr = expr.toarray()
    expr = np.asarray(expr, dtype=float).ravel()

    ct_labels = np.asarray(obs['ct_cov'].values)
    ind_labels = np.asarray(obs['ind_cov_disease_cov'].values)
    indlist = obs['ind_cov_disease_cov'].unique().tolist()

    records = []
    for celltype in celltypes:
        is_ct = ct_labels == celltype
        for ind in indlist:
            is_ind = ind_labels == ind
            selected = expr[is_ct & is_ind]
            records.append({
                'Unique_ID': ind,
                'disease_cov': obs['disease_cov'][is_ind].iloc[0],
                # nan when the sample has no cell of this type.
                gene: selected.mean() if selected.size else np.nan,
                # Number of cells of this sample across ALL cell types in
                # cdata (the original row mask did not include the cell type).
                # NOTE(review): confirm a per-cell-type weight was not intended.
                'Cell_Count': int(is_ind.sum()),
                'ct_cov': celltype,
            })

    genexpression = pd.DataFrame(records, columns=['Unique_ID', 'disease_cov', gene, 'Cell_Count', 'ct_cov'])
    genexpression.ct_cov = genexpression.ct_cov.astype('category')
    # col_order pins panel i to celltypes[i], which the loop below relies on.
    perc_plot = sns.catplot(x='disease_cov', y=gene, order=disease_order, hue='ct_cov', data=genexpression, kind='violin', col='ct_cov', col_order=list(celltypes), col_wrap=2, cut=0, dodge=False, aspect=1, sharex=False, sharey=True)
    for ct_i, celltype in enumerate(celltypes):
        panel = perc_plot.axes[ct_i]
        ct_rows = genexpression[genexpression.ct_cov==celltype]
        sns.swarmplot(x="disease_cov", y=gene, data=ct_rows, order=disease_order, color="0", size=6, ax=panel)
        try:
            # The original filtered on a non-existent `Cell_Type` column; the
            # resulting AttributeError was swallowed by a bare except, so the
            # lines and the clean-up below never ran.
            sns.pointplot(x="disease_cov", y=gene, hue="Unique_ID", data=ct_rows, order=disease_order, color="0", scale=0.3, ax=panel)
        except Exception as err:
            logging.warning("pointplot failed for %s / %s: %s", gene, celltype, err)
        panel.get_xaxis().label.set_visible(False)
        legend = panel.get_legend()
        if legend is not None:
            legend.remove()
    # Grid-wide settings, applied once.
    perc_plot.set_xticklabels(rotation=90)
    perc_plot.fig.subplots_adjust(wspace=0.5, hspace = 1)

    return genexpression

def get_stats(dataframe, groups, gene):
    """Run a weighted least-squares test of ``gene`` per cell type and display it.

    For every category of ``dataframe.ct_cov`` the model
    ``gene ~ const + indicator`` is fitted with ``Cell_Count`` as weights.

    Parameters
    ----------
    dataframe : DataFrame with columns ``disease_cov``, ``ct_cov``
        (categorical), ``Cell_Count`` and ``<gene>``, as returned by
        ``plot_gene_expression``.
    groups : two disease labels ``[coded 0, coded 1]``. If ``'SLE'`` is in
        ``groups``, Healthy is coded 0 and Treated / Managed / Flare are
        coded 1.
    gene : name of the expression column in ``dataframe``.

    Returns
    -------
    DataFrame with columns ``Cell``, ``Beta`` (string) and ``Pval`` (float);
    the same table is passed to ``display``.
    """
    print('{} groups weighted by number of cells'.format(groups))
    sle_vs_healthy = 'SLE' in groups
    keep = ['Treated', 'Managed', 'Healthy', 'Flare'] if sle_vs_healthy else groups
    # Use the table that was passed in; the original read the notebook-level
    # `ind_perc` and ignored this argument.
    subset = dataframe[dataframe.disease_cov.isin(keep)]

    rows = []
    for ct in subset.ct_cov.cat.categories:
        is_ct = subset.ct_cov == ct
        response = subset[gene][is_ct].astype(float)
        weights = subset.Cell_Count[is_ct]
        disease = subset.disease_cov[is_ct].astype("str")
        if sle_vs_healthy:
            # Computed in one step: the original recoded Healthy to 0 first,
            # after which every row satisfied `!= 'Healthy'` and became 1.
            indicator = disease != 'Healthy'
        else:
            indicator = disease == groups[1]
        design = sm.add_constant(indicator.astype(float))
        fit = sm.WLS(response, design, weights=weights).fit()
        # Positional access: the parameters are labelled by column name.
        effect = fit.params.iloc[1]
        pval = fit.pvalues.iloc[1]
        rows.append({"Cell": str(ct), "Beta": str(effect), "Pval": float(pval)})

    # DataFrame.append was removed in pandas 2.0; build the table in one go.
    all_out = pd.DataFrame(rows, columns=["Cell", "Beta", "Pval"])
    display(all_out)
    return all_out
    


# Example gene

In [None]:
# Plot and test two example genes in both cytotoxic populations.
celltypes = ['CD4+ Cytotoxic T Cells', 'CD8+ Cytotoxic T Cells']
genes     = ['IFITM1', 'CD69']
#genes =['MT-ND1', 'MT-ND2', 'MT-CO1', 'MT-CO2', 'MT-ATP8', 'MT-ATP6', 'MT-CO3', 'MT-ND3', 'MT-ND4L', 'MT-ND4', 'MT-ND5', 'MT-ND6', 'MT-CYB']
for gene_name in genes:
    # The per-sample table is stored in `ind_perc` (replacing the proportions
    # table of the same name) and then tested Healthy vs. Managed.
    ind_perc = plot_gene_expression(cdata, celltypes, gene_name)
    get_stats(ind_perc, ['Healthy', 'Managed'], gene_name)
    
    

# Look at genes of interest

In [None]:
# Plot and test a panel of genes of interest in both cytotoxic populations.
celltypes = ['CD4+ Cytotoxic T Cells', 'CD8+ Cytotoxic T Cells']
genes     = ['CD4', 'CD40LG', 'ZBTB7B', 'CD3G', 'TNFRSF8', 'TNFRSF4', 'TNFRSF9', 'ICOS', 'TBX21', 'CD8A', 'CD8B','CD27', 'PRF1', 'GZMH', 'GZMK', 'GZMM', 'RUNX3', 'PRDM1', 'CD28', 'KLRK1', 'CTLA4', 'PDCD1', 'BCL2', 'BCL2L11','BAX','EOMES']
for gene_name in genes:
    # Keep the table returned for THIS gene. The original discarded it, so
    # get_stats received the table of an earlier gene, which has no column
    # for the current one.
    ind_perc = plot_gene_expression(cdata, celltypes, gene_name)
    get_stats(ind_perc, ['Healthy', 'Managed'], gene_name)

In [None]:
# Stacked violin of the gene panel (raw values) per cytotoxic population.
genes = ['CD4', 'CD40LG', 'ZBTB7B', 'CD3G', 'TNFRSF8', 'TNFRSF4', 'TNFRSF9', 'ICOS', 'TBX21', 'CD8A', 'CD8B','CD27', 'PRF1', 'GZMH', 'GZMK', 'GZMM', 'RUNX3', 'PRDM1', 'CD28', 'KLRK1', 'CTLA4', 'PDCD1', 'BCL2', 'BCL2L11','BAX','EOMES']
sc.pl.stacked_violin(
    cdata,
    groupby='ct_cov',
    use_raw=True,
    var_names=genes,
    show=True,
    save='Cytogene.png',
)

# Differentially expressed genes between CD4+ and CD8+ cytotoxic populations

In [None]:
# Rank genes between the cell-type groups of cdata and plot the top 50 per group.
sc.tl.rank_genes_groups(cdata, groupby='ct_cov')
sc.pl.rank_genes_groups(
    cdata,
    n_genes=50,
    fontsize=5,
)

# Differentially expressed genes between CD4+ and CD8+ cytotoxic populations (only SLE patients)

In [None]:
# Restrict to the Managed / Treated / Flare samples, rank genes between the
# cell-type groups and keep the ranked gene names as a DataFrame.
is_sle = cdata.obs['disease_cov'].isin(['Managed', 'Treated', 'Flare'])
ddata = cdata[is_sle]
sc.tl.rank_genes_groups(ddata, groupby='ct_cov', n_genes=25)
sc.pl.rank_genes_groups(ddata, n_genes=25, fontsize=8)
ranked_names = ddata.uns['rank_genes_groups']['names']
df = pd.DataFrame(ranked_names)

In [None]:
# Plot every top-ranked gene of either cytotoxic population.
genes = df['CD4+ Cytotoxic T Cells'].tolist() + df['CD8+ Cytotoxic T Cells'].tolist()
for gene_name in genes:
    plot_gene_expression(cdata, celltypes, gene_name)
    #get_stats(ind_perc, ['Healthy', 'Managed'], gene_name)

# Differentially expressed genes between SLE and Healthy for CD4+ cytotoxic cells

In [None]:
# Within the CD4+ cytotoxic cells, rank genes of each disease group against Healthy.
ddata = cdata[cdata.obs['ct_cov']=='CD4+ Cytotoxic T Cells']
sc.tl.rank_genes_groups(
    ddata,
    groupby='disease_cov',
    groups=['Healthy', 'Managed', 'Treated', 'Flare'],
    reference='Healthy',
    n_genes=25,
)
sc.pl.rank_genes_groups(ddata, n_genes=25)
sc.pl.rank_genes_groups_stacked_violin(
    ddata,
    groupby='disease_cov',
    n_genes=25,
    dendrogram=True,
)


In [None]:
# Plot every top-ranked gene of the Managed, Treated and Flare comparisons.
df = pd.DataFrame(ddata.uns['rank_genes_groups']['names'])
genes = df['Managed'].tolist() + df['Treated'].tolist()+df['Flare'].tolist()
for gene_name in genes:
    plot_gene_expression(cdata, celltypes, gene_name)

# Differentially expressed genes between SLE and Healthy for CD8+ cytotoxic cells

In [None]:
# Within the CD8+ cytotoxic cells, rank genes of each disease group against Healthy.
ddata = cdata[cdata.obs['ct_cov']=='CD8+ Cytotoxic T Cells']
sc.tl.rank_genes_groups(
    ddata,
    groupby='disease_cov',
    groups=['Healthy', 'Managed', 'Treated', 'Flare'],
    reference='Healthy',
    n_genes=50,
)
sc.pl.rank_genes_groups(ddata, n_genes=25)

# Gene rank scores for CD8+ cytotoxic vs. CD4+ cytotoxic and Healthy versus SLE

In [None]:
# Scores for differentially expressed genes in disease status
# NOTE(review): `mastergenes` is not defined in the visible part of this
# notebook — confirm it exists before running this cell.
sc.tl.rank_genes_groups(
    cdata,
    groupby='disease_cov',
    n_genes=len(mastergenes),
    groups=['Healthy', 'Managed', 'Treated', 'Flare'],
    reference='Healthy',
)
genescores = cdata.uns['rank_genes_groups']['scores']
genenames = cdata.uns['rank_genes_groups']['names']
df = pd.DataFrame(genenames)
dfscore = pd.DataFrame(genescores).rename(
    columns={'Managed':'Managed_score', 'Treated': 'Treated_score', 'Flare': 'Flare_score'}
)

# One table per disease group: gene name as index, that group's score as the column.
dfmanaged = (
    pd.concat([df['Managed'], dfscore['Managed_score']], axis=1)
    .set_index('Managed')
    .rename_axis('Gene')
)
dftreated = (
    pd.concat([df['Treated'], dfscore['Treated_score']], axis=1)
    .set_index('Treated')
    .rename_axis('Gene')
)
dfflare = (
    pd.concat([df['Flare'], dfscore['Flare_score']], axis=1)
    .set_index('Flare')
    .rename_axis('Gene')
)

# Scores for differentially expressed genes for CD8+ and CD4+ Cytotoxic T cells
sc.tl.rank_genes_groups(
    cdata,
    groupby='ct_cov',
    n_genes=len(mastergenes),
    groups=['CD4+ Cytotoxic T Cells', 'CD8+ Cytotoxic T Cells'],
    reference='CD4+ Cytotoxic T Cells',
)
genescores = cdata.uns['rank_genes_groups']['scores']
genenames = cdata.uns['rank_genes_groups']['names']
df2score = pd.DataFrame(genescores).rename(
    columns={'CD8+ Cytotoxic T Cells':'CD4+<->CD8+_score'}
)
df2 = (
    pd.concat([pd.DataFrame(genenames), df2score], axis=1)
    .set_index('CD8+ Cytotoxic T Cells')
    .rename_axis('Gene')
)

# Attach the CD4-vs-CD8 score to each disease table (aligned on gene name)
# and sort by the disease score, then by the CD4-vs-CD8 score.
dfmanaged = pd.concat([dfmanaged, df2], axis=1).sort_values(by=['Managed_score', 'CD4+<->CD8+_score'])
dftreated = pd.concat([dftreated, df2], axis=1).sort_values(by=['Treated_score', 'CD4+<->CD8+_score'])
dfflare = pd.concat([dfflare, df2], axis=1).sort_values(by=['Flare_score', 'CD4+<->CD8+_score'])


In [None]:
# Scatter the disease-vs-Healthy score against the CD4-vs-CD8 score, one series per disease group.
score_series = (
    (dfmanaged, 'Managed_score', 'Managed'),
    (dftreated, 'Treated_score', 'Treated'),
    (dfflare, 'Flare_score', 'Flare'),
)
for score_table, score_column, series_label in score_series:
    sns.scatterplot(x='CD4+<->CD8+_score', y=score_column, data=score_table, label=series_label)
plt.ylabel('Healthy vs. SLE')
plt.xlabel('CD4+ vs CD8+')
plt.legend()

# Rename cytotoxic cells in main dataframe

In [None]:
# Copy the CD4+/CD8+ cytotoxic labels from cdata onto the matching cells of adata.
CD4 = cdata[cdata.obs['ct_cov']=='CD4+ Cytotoxic T Cells'].obs_names.tolist()
CD8 = cdata[cdata.obs['ct_cov']=='CD8+ Cytotoxic T Cells'].obs_names.tolist()
# Edit a standalone object-typed copy and assign it back in one step. The
# original wrote through adata.obs['ct_cov'][mask], a chained assignment that
# is not guaranteed to reach adata.obs.
ct_labels = adata.obs['ct_cov'].astype('object')
ct_labels.loc[adata.obs_names.isin(CD4)] = 'CD4+ Cytotoxic T Cells'
ct_labels.loc[adata.obs_names.isin(CD8)] = 'CD8+ Cytotoxic T Cells'
adata.obs['ct_cov'] = ct_labels.astype('category')

In [None]:
# Display order of the cell types, now with the two cytotoxic populations split out.
MasterORDER = [
    'Classical Monocytes',
    'Nonclassical Monocytes',
    'Classical Dendritic Cells',
    'Plasmacytoid Dendritic Cells',
    'Naive T Cells',
    'Effector Memory T Cells',
    'CD8+ Cytotoxic T Cells',
    'CD4+ Cytotoxic T Cells',
    'Proliferating T Cells',
    'Natural Killer Cells',
    'B Cells',
    'Progenitor Cells',
    'Megakaryocytes',
    'RBCs',
]
#colorrs = ["#E58606","#5D69B1","#52BCA3","#99C945","#CC61B0","#24796C","#DAA51B","#2F8AC4","#764E9F","#ED645A","#CC3A8E",'#BC23FF', '#D790FF']
colorrs = [
    "#4E79A7", "#A0CBE8", "#F28E2B", "#FFBE7D", "#8CD17D", "#B6992D",
    "#499894", "#E15759", "#FF9D9A", "#79706E", "#D37295", "#FABFD2",
    "#B07AA1", "#D4A6C8", "#9D7660", "#E58606", "#5D69B1", "#24796C",
    '#DAA51B', '#000000', '#99C945', '#ED645A',
]

# reorder_categories raises unless the observed categories are exactly MasterORDER.
reordered = adata.obs['ct_cov'].cat.reorder_categories(MasterORDER)
adata.obs['ct_cov'] = reordered
adata.uns['ct_cov_colors'] = colorrs
celltype_umap = sc.pl.umap(adata, color='ct_cov', show=True, size=3, edgecolor="none")

In [None]:
## Make proportion plots
# Count cells per (sample, cell type, disease status, population, individual)
# and express each count as a percentage of that sample's total cell count.
adata_obs_small = adata.obs
ind_count = adata_obs_small.groupby(['ind_cov_disease_cov','ct_cov','disease_cov','pop_cov', 'ind_cov'])['ct_cov'].count()

# Total cells per sample (first index level), broadcast back onto every row of
# ind_count so it lines up positionally with the percentage table below.
_sample_totals = ind_count.groupby(level=[0]).transform("sum")
ind_count_sums = ind_count.groupby(level=[0]).sum().reset_index(name="counts")

ind_perc = (ind_count / _sample_totals * 100).reset_index(name="ct_perc")
# Add weights to WLS
ind_perc['counts'] = ind_count.values.tolist()
# Total cells per sample. This replaces a per-sample loop that filled the
# column through chained assignment (`df[col][mask] = value`), which pandas
# does not guarantee to write back.
ind_perc['ind_count_sum'] = _sample_totals.values.tolist()

# Keep only samples with more than 100 cells.
# NOTE(review): the original comment said "filter samples that have < 100
# cells", but the comparison is strict, so samples with exactly 100 cells are
# dropped as well - confirm which threshold is intended.
# .copy() makes the filtered table independent so the column assignments below
# do not write into a slice of the unfiltered frame.
ind_perc = ind_perc[ind_perc['ind_count_sum'] > 100].copy()


# Reduce the combined sample key to the text before its first underscore.
ind_perc['ind_cov_disease_cov'] = (
    ind_perc['ind_cov_disease_cov'].apply(lambda x: x.split('_')[0]).astype("str")
)
ind_count_sums['ind_cov_disease_cov'] = ind_count_sums['ind_cov_disease_cov'].apply(lambda x: x.split('_')[0])

# As before, the pre-filter row labels are kept in a new 'index' column.
ind_perc = ind_perc.reset_index()
# Give ct_cov the same category order as adata.obs so plots follow it.
ind_perc['ct_cov'] = ind_perc['ct_cov'].astype('category').cat.reorder_categories(
    adata.obs.ct_cov.cat.categories.values
)

# One violin panel per cell type (percentage of the sample's cells by disease
# status), with individual samples overlaid as swarm points and per-individual
# point plots.
_disease_order = ["Healthy", "Managed", "Treated", "Flare"]
perc_plot = sns.catplot(x='disease_cov', y='ct_perc', order=_disease_order, hue='ct_cov', data=ind_perc, kind='violin', col_order=MasterORDER, col='ct_cov', col_wrap=3, cut=0, dodge=False, aspect=1, sharex=False, sharey=False, palette=colorrs)

for ct_i, ct in enumerate(MasterORDER):
    _panel = perc_plot.axes[ct_i]
    _ct_rows = ind_perc[ind_perc.ct_cov == ct]
    sns.swarmplot(x="disease_cov", y="ct_perc", data=_ct_rows, order=_disease_order, color="0", size=4, ax=_panel)
    try:
        sns.pointplot(x="disease_cov", y="ct_perc", hue="ind_cov", data=_ct_rows, order=_disease_order, color="0", scale=0.3, ax=_panel)
    except Exception as err:
        # Still best-effort, but no longer a bare `except:` (which also
        # swallowed KeyboardInterrupt) and no longer silent.
        logging.warning("Per-individual point plot skipped for %s: %s", ct, err)
    # Tidy every panel, including those where the point plot failed; the old
    # `continue` left those panels with their axis labels still showing.
    _panel.get_yaxis().label.set_visible(False)
    _panel.get_xaxis().label.set_visible(False)
    _legend = _panel.get_legend()
    if _legend is not None:  # absent when the point plot did not draw
        _legend.remove()

# Grid-wide settings do not depend on the panel, so apply them once.
perc_plot.set_xticklabels(rotation=90)
perc_plot.fig.subplots_adjust(wspace=2, hspace = 1)

perc_plot.savefig(figdir+"/violin.ct_cov_figure6b.png")
perc_plot.savefig(figdir+"/violin.ct_cov_figure6b.pdf")

# Statistical tests...
def _wls_by_celltype(perc_table, groups, reference):
    """Fit one weighted least-squares model per cell type.

    For every category of ``perc_table.ct_cov`` the cell-type percentage is
    regressed on a 0/1 indicator (1 where ``disease_cov == reference``, 0 for
    every other retained group) plus an intercept, weighted by the sample's
    total cell count (``ind_count_sum``).

    Parameters
    ----------
    perc_table : pandas.DataFrame
        Table with columns ``ct_cov`` (categorical), ``ct_perc``,
        ``disease_cov`` and ``ind_count_sum``.
    groups : list of str
        ``disease_cov`` values to keep for this comparison.
    reference : str
        The ``disease_cov`` value coded as 1.

    Returns
    -------
    subset : pandas.DataFrame
        Rows of ``perc_table`` whose ``disease_cov`` is in ``groups``.
    table : pandas.DataFrame
        One row per cell type with columns ``Cell``, ``Beta`` (kept as a
        string, as before) and ``Pval`` (float).
    fits : dict
        Cell type -> fitted statsmodels results object.
    """
    subset = perc_table[perc_table.disease_cov.isin(groups)]
    fits = {}
    rows = []
    for ct in subset.ct_cov.cat.categories:
        in_ct = subset.ct_cov == ct
        # Build the indicator as a new Series rather than overwriting
        # Series.values in place, which relies on a writable object array.
        indicator = (subset.disease_cov[in_ct].astype("str") == reference).astype(float)
        design = sm.add_constant(indicator)
        fit = sm.WLS(subset.ct_perc[in_ct].astype(float), design, weights=subset.ind_count_sum[in_ct]).fit()
        fits[ct] = fit
        # .iloc: params/pvalues are label-indexed; position 1 is the slope.
        rows.append({"Cell": str(ct), "Beta": str(fit.params.iloc[1]), "Pval": float(fit.pvalues.iloc[1])})
    # Collect rows first; DataFrame.append was removed in pandas 2.0.
    table = pd.DataFrame(rows, columns=["Cell", "Beta", "Pval"])
    table["Pval"] = table["Pval"].astype(float)
    return subset, table, fits


print('as a proportion of total PBMC: Healthy vs. Managed WEIGHTED BY TOTAL PBMC COUNTS')
ind_perc0, all_out, _fits = _wls_by_celltype(ind_perc, ['Healthy', 'Managed'], 'Healthy')
HEALTHYMANAGED_effect = {ct: fit.params.iloc[1] for ct, fit in _fits.items()}
display(all_out)

# Statistical tests...
print('as a proportion of total PBMC: Healthy vs. Treated WEIGHTED BY TOTAL PBMC COUNTS')
ind_perc0, all_out, _fits = _wls_by_celltype(ind_perc, ['Healthy', 'Treated'], 'Healthy')
HEALTHYTREATED_effect = {ct: fit.params.iloc[1] for ct, fit in _fits.items()}
display(all_out)

# Statistical tests...
print('as a proportion of total PBMC: Healthy vs. Flare WEIGHTED BY TOTAL PBMC COUNTS')
ind_perc0, all_out, _fits = _wls_by_celltype(ind_perc, ['Healthy', 'Flare'], 'Healthy')
HEALTHYFLARE_effect = {ct: fit.params.iloc[1] for ct, fit in _fits.items()}
# log2 of (intercept + slope) / intercept, i.e. the fitted Healthy level over
# the fitted Flare level.
HEALTHYFLARE_FC = {ct: math.log2(sum(fit.params)/fit.params.iloc[0]) for ct, fit in _fits.items()}
display(all_out)

# Statistical tests...
print('as a proportion of total PBMC: Treated vs. Flare WEIGHTED BY TOTAL PBMC COUNTS')
ind_perc0, all_out, _fits = _wls_by_celltype(ind_perc, ['Treated', 'Flare'], 'Treated')
TREATEDFLARE_effect = {ct: fit.params.iloc[1] for ct, fit in _fits.items()}
display(all_out)

# Statistical tests...
print('as a proportion of total PBMC: Treated vs. Managed WEIGHTED BY TOTAL PBMC COUNTS')
ind_perc0, all_out, _fits = _wls_by_celltype(ind_perc, ['Treated', 'Managed'], 'Treated')
TREATEDMANAGED_effect = {ct: fit.params.iloc[1] for ct, fit in _fits.items()}
display(all_out)

# Statistical tests...
# The pooled comparisons previously filtered on 'Untreated', while every other
# comparison and the plot order use 'Flare', so Flare samples were excluded
# from the pooled SLE groups. 'Flare' is now included; 'Untreated' is still
# accepted in case that label exists in the data.
print('as a proportion of total PBMC: Healthy vs. SLE WEIGHTED BY TOTAL PBMC COUNTS')
ind_perc0, all_out, _fits = _wls_by_celltype(ind_perc, ['Healthy', 'Treated', 'Flare', 'Untreated', 'Managed'], 'Healthy')
display(all_out)

# Statistical tests...
print('as a proportion of total PBMC: Healthy vs. Treated&Flare WEIGHTED BY TOTAL PBMC COUNTS')
ind_perc0, all_out, _fits = _wls_by_celltype(ind_perc, ['Healthy', 'Treated', 'Flare', 'Untreated'], 'Healthy')
display(all_out)


# Gene Ontology analysis for each cell type

In [None]:
from __future__ import print_function
from Bio import Entrez
from goatools.base import download_go_basic_obo
from goatools.base import download_ncbi_associations
from goatools.obo_parser import GODag
from goatools.anno.genetogo_reader import Gene2GoReader
from goatools.test_data.genes_NCBI_9606_ProteinCoding import GENEID2NT as GeneID2nt_hsa
from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS
from goatools.godag_plot import plot_gos, plot_results, plot_goid2goobj
# Store gene summaries
# Module-level accumulator (gene symbol -> summary text); it has the shape that
# replace_id_with_symbol() fills in through its `summary_dict` argument.
summary_dict = {}
def getgeneids(gene_list):
    """Look up the first NCBI Gene ID returned for each gene symbol.

    Each symbol is searched in the Entrez "gene" database with the term
    ``<symbol>[GENE] AND Homo``; the first ID in the result is kept.
    Symbols whose lookup fails for any reason (no hit, network error, a
    non-string entry) are skipped, so the result may be shorter than the input.

    Parameters
    ----------
    gene_list : iterable of str
        Gene symbols to resolve.

    Returns
    -------
    dict
        Maps integer gene ID -> the gene symbol it was found for.
    """
    Entrez.email = "richard.perez@ucsf.edu"
    # Store symbol and id
    geneid2symbol = {}
    for genesymbol in gene_list:
        try:
            handle = Entrez.esearch(db="gene", term=str(genesymbol + "[GENE] AND Homo"))
            try:
                record = Entrez.read(handle)
            finally:
                # Close even when parsing fails or the search has no hits
                # (previously the handle leaked on those paths).
                handle.close()
            geneid2symbol[int(record['IdList'][0])] = genesymbol
        except Exception as err:
            # Still best-effort, but no longer a bare `except:` that also
            # swallowed KeyboardInterrupt, and no longer silent.
            logging.warning("Entrez lookup skipped for %r: %s", genesymbol, err)
    return geneid2symbol

def id2symbol(GENES, celltypes, cdata):
    """Print symbol and summary for each gene ID and plot its expression.

    For every ID in ``GENES`` the symbol and summary are fetched with
    ``id2symbolfast``, printed, and the symbol is passed to
    ``plot_gene_expression(cdata, celltypes, symbol)``.

    Parameters
    ----------
    GENES : iterable
        NCBI Gene IDs.
    celltypes, cdata
        Passed through unchanged to ``plot_gene_expression``.

    Returns
    -------
    tuple
        ``(symbol, summary)`` of the last gene processed, or ``(None, None)``
        when ``GENES`` is empty (this used to raise ``UnboundLocalError``).
    """
    symbol, summary = None, None
    for geneID in GENES:
        # Same epost/esummary round trip that used to be duplicated inline.
        symbol, summary = id2symbolfast(geneID)
        print(symbol)
        print(summary)
        plot_gene_expression(cdata, celltypes, symbol)
    return symbol, summary

def id2symbolfast(gene_id):
    """Return the symbol and summary text for a single NCBI Gene ID.

    Posts the ID to Entrez (``epost``), then requests the document summary
    (``esummary``) for that posted set and reads the first entry.

    Parameters
    ----------
    gene_id
        NCBI Gene ID; it is converted with ``str`` before being posted.

    Returns
    -------
    tuple
        ``(symbol, summary)`` taken from the ``Name`` and ``Summary`` fields of
        the first document summary.
    """
    Entrez.email = "richard.perez@ucsf.edu"
    id_list = [str(gene_id)]
    request = Entrez.epost("gene", id=",".join(id_list))
    try:
        result = Entrez.read(request)
    finally:
        # Both handles were previously left open; close them as getgeneids does.
        request.close()
    data = Entrez.esummary(db="gene", webenv=result["WebEnv"], query_key=result["QueryKey"])
    try:
        annotations = Entrez.read(data)
    finally:
        data.close()
    first_doc = annotations['DocumentSummarySet']['DocumentSummary'][0]
    return first_doc['Name'], first_doc['Summary']

def replace_id_with_symbol(results, summary_dict):
    """Replace comma-separated gene IDs in ``study_items`` with gene symbols.

    Each entry of ``results['study_items']`` is split on commas and parsed as
    integers; every ID is resolved with ``id2symbolfast``. The entry is
    replaced by the list of symbols, and each symbol's summary is recorded in
    ``summary_dict``.

    Parameters
    ----------
    results : pandas.DataFrame
        Table with a ``study_items`` column of comma-separated gene ID strings.
        Modified in place.
    summary_dict : dict
        Updated in place with symbol -> summary text.

    Returns
    -------
    tuple
        ``(results, summary_dict)``, the same objects that were passed in.
    """
    symbol_lists = []
    for item in results['study_items']:
        gene_ids = np.asarray(item.split(','), dtype=int)
        symbols = []
        for gene_id in gene_ids:
            # One lookup per gene: the symbol and summary come from the same
            # response (each ID used to be fetched twice over the network).
            symbol, summary = id2symbolfast(gene_id)
            # Keep all gene summaries
            summary_dict[symbol] = summary
            symbols.append(symbol)
        symbol_lists.append(symbols)
    # Replace the column in one assignment instead of writing cell by cell
    # through chained indexing (`results[col][row] = ...`).
    results['study_items'] = pd.Series(symbol_lists, index=results.index, dtype=object)
    return results, summary_dict

def run_GO(gene_list, cutoff, title):
    """Run a GO enrichment study for a list of gene symbols and save results.

    The symbols are resolved to NCBI Gene IDs with ``getgeneids`` and tested
    against the human protein-coding gene population (``GeneID2nt_hsa``) with
    goatools, using Benjamini-Hochberg FDR correction. Terms whose corrected
    p-value is below ``cutoff`` are plotted to ``title + "{NS}.png"`` and
    written to ``title + ".xlsx"``.

    Parameters
    ----------
    gene_list : iterable of str
        Study gene symbols.
    cutoff : float
        Used both as the study's ``alpha`` and as the threshold on
        ``p_fdr_bh`` for the reported terms.
    title : str
        Prefix for the output file names.

    Returns
    -------
    tuple
        ``(geneid2symbol, goea_results_sig)``: the ID -> symbol mapping that
        was studied, and the list of significant result records.
    """
    obo_fname = download_go_basic_obo()
    fin_gene2go = download_ncbi_associations()
    # Load the file the downloader reports instead of a hard-coded file name.
    obodag = GODag(obo_fname)

    # Read NCBI's gene2go. Store annotations in a list of namedtuples
    objanno = Gene2GoReader(fin_gene2go, taxids=[9606])

    # Get namespace2association where:
    #    namespace is:
    #        BP: biological_process
    #        MF: molecular_function
    #        CC: cellular_component
    #    association is a dict:
    #        key: NCBI GeneID
    #        value: A set of GO IDs associated with that gene
    ns2assoc = objanno.get_ns2assc()

    for nspc, id2gos in ns2assoc.items():
        print("{NS} {N:,} annotated human genes".format(NS=nspc, N=len(id2gos)))

    goeaobj = GOEnrichmentStudyNS(
            GeneID2nt_hsa.keys(), # List of human protein-coding genes
            ns2assoc, # geneid/GO associations
            obodag, # Ontologies
            propagate_counts = False,
            alpha = cutoff, # default significance cut-off
            methods = ['fdr_bh']) # default multipletest correction method

    geneid2symbol = getgeneids(gene_list)
    # 'p_' means "pvalue". 'fdr_bh' is the multiple test method
    geneids_study = geneid2symbol.keys()
    goea_results_all = goeaobj.run_study(geneids_study)
    goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < cutoff]
    # This plot contains GOEA results:
    #
    # GO terms colored by P-value:
    #   pval < 0.005 (light red)
    #   pval < 0.01 (light orange)
    #   pval < 0.05 (yellow)
    #   pval > 0.05 (grey) Study terms that are not statistically significant
    # GO terms with study gene counts printed. e.g., "32 genes"
    plot_results(title+"{NS}.png", goea_results_sig, id2symbol=geneid2symbol, study_items=20, items_p_line=5)
    goeaobj.wr_xlsx(title+".xlsx", goea_results_sig, id2symbol=geneid2symbol)
    return geneid2symbol, goea_results_sig

In [None]:
# For every leiden cluster: rank genes SLE vs. Healthy, take the genes whose
# score is more than 3 standard deviations above the mean score, run GO
# enrichment on them and display the spreadsheet that run_GO wrote.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)
ct_groups = np.unique(adata.obs['leiden'].tolist())
for _cluster in ct_groups:
    try:
        ddata = adata[adata.obs['leiden']==_cluster]
        sc.tl.rank_genes_groups(ddata, groupby='SLE status',groups=['Healthy', 'SLE'], reference='Healthy', n_genes=len(ddata.raw.var_names))
        sc.pl.rank_genes_groups(ddata, n_genes=25, save='{}_SLEvsHealthy.png'.format(_cluster))
        genes = pd.DataFrame(ddata.uns['rank_genes_groups']['names'])
        scores = pd.DataFrame(ddata.uns['rank_genes_groups']['scores'])
        # Threshold mask, computed once (it used to be rebuilt three times).
        # .iloc[0] picks the first column's threshold by position, which the
        # old `[0]` did implicitly on a label-indexed Series.
        _above = scores.values>(scores.mean()+(scores.std()*3)).iloc[0]
        gene_list = list(np.ravel(genes[_above].values.tolist()))
        cutoff = 0.001
        title = str(_cluster)+'_Lupus_Study'
        run_GO(gene_list,cutoff,title)
        results = pd.read_excel(title+".xlsx")
        # Optional: swap gene IDs for symbols in the results table.
        #results, summary_dict = replace_id_with_symbol(results, summary_dict)
        print(_cluster)
        print('Number of genes 3 standard deviations above mean: {}'.format(np.sum(_above)))
        display(results)
    except Exception as err:
        # Clusters are still skipped on failure, but the reason is now
        # reported and the loop can be interrupted (a bare `except:` also
        # caught KeyboardInterrupt).
        logging.warning("Skipping leiden cluster %s: %s", _cluster, err)
        continue

# EBV-specific T cells: CD45RA-, HLA-DR+, CD38+, low SELL (CD62L)

In [None]:
# UMAP of all cells, one panel per key in `color`
# (presumably gene symbols present in adata - TODO confirm).
sc.pl.umap(adata, color=['CD69', 'HLA-DRA', 'HLA-DRB1', 'CD38', 'CD8A', 'IFNG', 'SELL', 'CD7', 'CD5', 'PAX5'], size=3)

In [None]:
# UMAP of all cells, one panel per '...|pAbO' key; two of these keys are read
# from .obs in the CD8+ scatter cell below, so they appear to be per-cell
# columns rather than genes.
sc.pl.umap(adata, color=['CD45RA|PTPRC|j95-28|pAbO', 'CD45RO|PTPRC|j95-19|pAbO', 'CD38|CD38|j95-01|pAbO', 'HLA-DR|CD74|j95-18|pAbO'], size=10)



In [None]:
# CD45RA vs. HLA-DR values (read from .obs) for the CD8+ cytotoxic T cells,
# first as a plain scatter and then as a hex-binned joint plot.
bdata = adata[adata.obs['ct_cov'].isin(['CD8+ Cytotoxic T Cells'])]
X = np.asarray(bdata.obs['CD45RA|PTPRC|j95-28|pAbO'].tolist())
Y = np.asarray(bdata.obs['HLA-DR|CD74|j95-18|pAbO'].tolist())
f, ax = plt.subplots(figsize=(7, 7))
# Pass the data by keyword: seaborn >= 0.12 no longer accepts x and y
# positionally, and older releases accept the keywords too.
ax = sns.scatterplot(x=X, y=Y)
plt.xlabel('CD45RA')
plt.ylabel('HLA-DR')
joint_kws=dict(gridsize=50)
sns.jointplot(x=X, y=Y, kind="hex", color="b", joint_kws= joint_kws);

# Helper T cell population distinct from TFH cells (Nature Medicine, 2019)

In [None]:
# UMAP of all cells, one panel per key in `color`; 'ct_cov' is the cell-type
# annotation column, the other keys are presumably gene symbols - TODO confirm.
sc.pl.umap(adata, color=['IL10', 'PDCD1', 'CD4', 'CXCR3', 'ct_cov'], size=10)