In [None]:
import scanpy as sc
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import warnings
import itertools as it
import json
import requests
import seaborn as sns
import pickle as pkl
from functools import reduce

import matplotlib as mpl
import matplotlib.patches as patches
from matplotlib.gridspec import GridSpec
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon

In [None]:
# Scanpy session settings: logging verbosity, figure DPI and number of jobs.
sc.settings.verbosity = 4
sc.settings.set_figure_params(dpi=80)
print(sc.__version__)
sc.settings.n_jobs=12

In [None]:
# Root directory that the data paths below are built from (absolute, machine-specific).
prefix = '/data/codec/production.run/mrna/'

In [None]:
path = prefix + 'obs/cts/br.t.txt'

# with open(path,'w') as file:
#     for bc in acg_nk.obs_names:
#         file.write(bc + '\n')

# One barcode per line; strip the newline and any surrounding whitespace.
with open(path, 'r') as handle:
    br_t_cells = [line.strip() for line in handle]

In [None]:
path = prefix + '../cond.colors.pkl'

# Load the pickled `cond_colors` object.
# NOTE: unpickling can execute arbitrary code; only load pickles from a trusted source.
with open(path,'rb') as file:
    cond_colors = pkl.load(file)

In [None]:
path = prefix + 'pkls/aggr/wells.sng.w_covars.pkl'

# with open(path,'wb') as file:
#     pkl.dump(wells, file)
    
# Load `wells`; the cells below index it by well key and read each entry's 'adata'.
# NOTE: unpickling can execute arbitrary code; only load pickles from a trusted source.
with open(path,'rb') as file:
    wells = pkl.load(file)

# Processing

In [None]:
# Rewrite every barcode as "<first 16 characters>-<well key>".
for well, entry in wells.items():
    entry['adata'].obs_names = ['%s-%s' % (bc[:16], well) for bc in entry['adata'].obs_names]

In [None]:
# Concatenate the AnnData objects of wells 0-11, then keep an independent copy of the barcodes in br_t_cells.
br_t = wells[0]['adata'].concatenate(*[wells[i]['adata'] for  i in range(1, 12)])[br_t_cells].copy()

In [None]:
# Total counts per gene. Summing on the matrix itself avoids building a dense
# copy of the entire matrix just to take the column sums.
br_t.var['n_counts'] = np.asarray(br_t.X.sum(axis=0)).ravel()

In [None]:
# Histogram of the per-gene total counts on 1000 log-spaced bin edges between 1 and 1e5.
plt.figure(figsize=(8,6))
plt.hist(br_t.var['n_counts'].values, bins=np.logspace(np.log10(1),np.log10(1e5), 1000))
plt.grid(False)
# Re-enable the grid for both major and minor ticks on both axes.
plt.grid(True, 'both', 'both')
plt.xscale('log')
# plt.yscale('log')

In [None]:
# Drop genes with fewer than 100 total counts, scale every cell to 1e6 counts,
# then log1p-transform.
sc.pp.filter_genes(br_t, min_counts=100, inplace=True)
sc.pp.normalize_per_cell(br_t, counts_per_cell_after=1e6)
sc.pp.log1p(br_t)

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
# One 100-bin histogram per per-cell metric stored in .obs.
for metric, panel in zip(['percent_mito', 'n_counts'], axes.ravel()):
    panel.hist(br_t.obs[metric].values, bins=100)

In [None]:
# Scale, run ComBat keyed on 'batch' with 'cond' and 'free_id' as covariates,
# scale again, then compute 100 principal components.
sc.pp.scale(br_t)
sc.pp.combat(br_t, key='batch',covariates=['cond','free_id'])
sc.pp.scale(br_t)
sc.pp.pca(br_t, n_comps=100)

In [None]:
# Variance ratio of the 100 principal components, on a log scale.
sc.pl.pca_variance_ratio(br_t,log=True, n_pcs=100)

In [None]:
# Silence warnings only while the neighbour graph is built. The context manager
# restores whatever warning filters were active before, instead of replacing
# them with a global 'default' filter afterwards.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    sc.pp.neighbors(br_t, n_neighbors=15, n_pcs=50)

In [None]:
# Compute the UMAP embedding used by all sc.pl.umap plots below.
sc.tl.umap(br_t)

In [None]:
# Leiden clustering at resolution 1; the labels are read from br_t.obs['leiden'] below.
sc.tl.leiden(br_t, resolution=1)

# Initial Visualization

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(20, 10))
# (obs column to colour by, palette) for each of the two panels.
panel_specs = [('cond', None), ('leiden', sc.pl.palettes.default_20)]
for panel, (color, palette) in zip(axes, panel_specs):
    panel.set_facecolor('whitesmoke')
    sc.pl.umap(br_t, color=color, ax=panel, show=False, return_fig=False, size=4, palette=palette)

In [None]:
fig, axes = plt.subplots(1, 4, figsize=(24, 5))
# One UMAP per covariate, drawn without a legend.
for covariate, panel in zip(['batch', 'percent_mito', 'n_counts', 'free_id'], axes.ravel()):
    panel.set_facecolor('black')
    sc.pl.umap(br_t, color=covariate, ax=panel, show=False, return_fig=False, size=3, legend_loc=None);

In [None]:
fig, axes = plt.subplots(2, 6, figsize=(30, 10))
batch_levels = br_t.obs['batch'].dtype.categories
# One panel per batch: a temporary boolean column marks that batch's cells.
for level, panel in tqdm(zip(batch_levels, axes.ravel())):
    br_t.obs['val'] = br_t.obs['batch'] == level
    panel.set_facecolor('black')
    sc.pl.umap(br_t, color='val', ax=panel, show=False, return_fig=False, title=level)
    br_t.obs.drop(columns='val', inplace=True)
plt.tight_layout()

In [None]:
fig, axes = plt.subplots(1, 7, figsize=(30, 4))
cond_levels = br_t.obs['cond'].dtype.categories
# One panel per condition: a temporary boolean column marks that condition's cells.
for level, panel in tqdm(zip(cond_levels, axes.ravel())):
    br_t.obs['val'] = br_t.obs['cond'] == level
    panel.set_facecolor('black')
    sc.pl.umap(br_t, color='val', ax=panel, show=False, return_fig=False, title=level)
    br_t.obs.drop(columns='val', inplace=True)
plt.tight_layout()

In [None]:
fig, axes = plt.subplots(2, 7, figsize=(25, 6.5))
leiden_levels = br_t.obs['leiden'].dtype.categories
# One panel per Leiden cluster: a temporary boolean column marks that cluster's cells.
for level, panel in tqdm(zip(leiden_levels, axes.ravel())):
    br_t.obs['val'] = br_t.obs['leiden'] == level
    panel.set_facecolor('black')
    sc.pl.umap(br_t, color='val', ax=panel, show=False, return_fig=False, title=level)
    br_t.obs.drop(columns='val', inplace=True)
plt.tight_layout()

### Marker Genes and Protein Data

Add the `.raw` attribute for visualization and gene tests.

In [None]:
def add_raw(adata, transformed=True):
    '''
    Set ``adata.raw`` from the unfiltered per-well data.

    Re-loads the pickled wells, renames their barcodes to
    "<first 16 characters>-<well key>", concatenates wells 0-11 and keeps the
    rows named in ``adata.obs_names``.

    adata = object whose ``.raw`` attribute is set (modified in place)
    transformed = if true, scale each cell to 1e6 counts and log1p-transform
    before storing; otherwise the values are stored as loaded

    Returns None.
    '''
    path = prefix + 'pkls/aggr/wells.sng.w_covars.pkl'

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as file:
        wells = pkl.load(file)

    for well in wells:
        wells[well]['adata'].obs_names = [i[:16] + '-%s' % well for i in wells[well]['adata'].obs_names]

    combined = wells[0]['adata'].concatenate(*[wells[i]['adata'] for i in range(1, 12)])
    # Explicit copy: the indexed subset would otherwise be normalised in place
    # while still being a subset of `combined`.
    raw = combined[adata.obs_names, :].copy()

    if transformed:
        sc.pp.normalize_per_cell(raw, counts_per_cell_after=1e6)
        sc.pp.log1p(raw)

    adata.raw = raw
    return

In [None]:
# Uses the default transformed=True, so .raw holds normalised, log1p-transformed values.
add_raw(br_t)

In [None]:
def add_proteins(adata):
    '''
    Join processed protein (ADT) values onto ``adata.obs``, one column per protein.

    The pickled ADT object is restricted to the barcodes of ``adata`` and then
    processed in this order: per-cell normalisation to 1e6, the row-wise
    CLR-style transform below, scaling, ComBat keyed on 'batch' with 'cond' and
    'free_id' as covariates, scaling again, and per-protein clipping of extreme
    values. Columns already present in ``adata.obs`` under a protein name
    (e.g. from an earlier call) are replaced.

    adata = object with ``.obs_names`` and an ``.obs`` containing 'cond' and 'free_id'

    Modifies ``adata.obs`` in place and returns None.
    '''

    def clr_normalize_column(x):
        # log1p of x divided by exp(sum of log1p over the positive entries / number
        # of entries). `len(x + 1)` is simply `len(x)`; kept as written.
        normed_column = np.log1p((x) / (np.exp(sum(np.log1p((x)[x > 0 ])) / len(x + 1))))
        return normed_column

    def clr_normalize(x):
        # Despite the helper's name, axis 1 means it is applied to every row of x.
        normed_matrix = np.apply_along_axis(clr_normalize_column, 1, x)
        return normed_matrix

    def clip_extreme(a):
        # Clip to the tighter of (5th smallest, 0.5th percentile) at the low end and
        # (5th largest, 99.5th percentile) at the high end; needs at least 5 values.
        # NOTE(review): newer NumPy spells the `interpolation=` keyword `method=` —
        # confirm against the NumPy version in use.
        a_sort = np.sort(a)
        a_min = max(a_sort[4], np.percentile(a, 0.5, interpolation='higher'))
        a_max = min(a_sort[-5], np.percentile(a, 99.5, interpolation='lower'))
        return np.clip(a, a_min=a_min, a_max=a_max)

    path = '/data/codec/production.run/adts/pkls/concat.adts.norm.pkl'
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as file:
        concat_adts = pkl.load(file)

    concat_adts = concat_adts[adata.obs_names, :].copy()
    concat_adts.obs = concat_adts.obs.join(adata.obs[['cond', 'free_id']])

    sc.pp.normalize_per_cell(concat_adts, counts_per_cell_after=1e6)
    concat_adts.X = clr_normalize(concat_adts.X.toarray())
    sc.pp.scale(concat_adts)
    sc.pp.combat(concat_adts, key='batch', covariates=['cond', 'free_id'])
    sc.pp.scale(concat_adts)
    # axis=0: each protein is clipped separately.
    concat_adts.X = np.apply_along_axis(clip_extreme, axis=0, arr=concat_adts.X)
    adts_df = pd.DataFrame(concat_adts.X, columns=concat_adts.var_names, index=concat_adts.obs_names)

    # errors='ignore' drops whichever protein columns already exist, so the join
    # cannot hit overlapping column names even if only some were present.
    adata.obs = adata.obs.drop(columns=concat_adts.var_names, errors='ignore').join(adts_df)
    return

Add the protein data into the `.obs`.

In [None]:
# Adds one .obs column per protein.
add_proteins(br_t)

Find out where our expected cell types are.

In [None]:
# Features to colour the UMAP by. Names containing '|' are the protein columns in .obs
# (presumably "protein|gene" — confirm); the remaining names are plain gene names.
features = ['CD8|CD8A', 'CD4|CD4', 'CD45RO|PTPRC', 'CD45RA|PTPRC',
           'TRDC','FOXP3','CCR4', 'CD69|CD69',
           'CD185|CXCR5', 'CD62L|SELL','CXCR3','CCR6',
           'CD26|DPP4','CCR7','CCL5','CD103|ITGAE']

In [None]:
fig, axes = plt.subplots(4, 4, figsize=(25, 24))
# One UMAP panel per entry of `features`.
for panel, feature in zip(axes.ravel(), features):
    panel.set_facecolor('black')
    sc.pl.umap(br_t, color=feature, ax=panel, show=False, return_fig=False, size=5, use_raw=True)
plt.tight_layout()

In [None]:
path = prefix + 'pkls/aggr/br_t/br_t.1.pkl'

# with open(path,'wb') as file:
#     pkl.dump(br_t, file, protocol=4)
    
# Checkpoint: this REPLACES the in-memory br_t with the pickled object, so everything
# below runs on the saved state rather than on what the cells above computed.
# NOTE: unpickling can execute arbitrary code; only load pickles from a trusted source.
with open(path,'rb') as file:
    br_t = pkl.load(file)

# Grouping and Subclustering
Going to combine or break up those clusters further.

In [None]:
# Clusters listed together here are merged into one group.
groupings = [[3, 4, 6]]
grouped_clusts = [clust for group in groupings for clust in group]
numclusts = np.unique(br_t.obs['leiden'].values.astype(int))
# Every cluster not merged above becomes its own single-member group.
groupings.extend([clust] for clust in np.setdiff1d(numclusts, grouped_clusts))

In [None]:
# 'ct<N>' -> list of cluster numbers merged into group N.
ctdict = {'ct%s' % idx: members for idx, members in enumerate(groupings)}
cluster_to_ct = {str(clust): ct for ct, members in ctdict.items() for clust in members}

# Exact-label lookup. A regex replace would also rewrite digits inside longer
# labels (e.g. the '3' in '13') and inside the 'ct<N>' names already written,
# and an in-place replace on a selected column is not guaranteed to write back.
old_labels = br_t.obs['leiden'].astype(str)
br_t.obs['celltype'] = old_labels.map(lambda label: cluster_to_ct.get(label, label))
# The new Leiden label is the group number without the 'ct' prefix.
br_t.obs['leiden'] = [ct.strip('ct') for ct in br_t.obs['celltype']]

In [None]:
# UMAP coloured by the regrouped Leiden labels.
sc.pl.umap(br_t, color='leiden', size=2)

In [None]:
# Clusters that get their own subset in sub_br_t below.
clusts = [0, 1, 2, 4, 5, 8]

In [None]:
# Independent copy of each selected cluster's cells, keyed by the cluster label as a string.
sub_br_t = {
    str(clust): br_t[br_t.obs['leiden'] == str(clust)].copy()
    for clust in clusts
}

In [None]:
sc.tl.leiden(sub_br_t['0'], resolution=0.5) # subcluster them using Leiden
# Second pass restricted to sub-cluster '3', splitting it further.
sc.tl.leiden(sub_br_t['0'], resolution=0.4, restrict_to=('leiden',['3']))
# Relabel every category by its position in the category list, giving plain
# '0', '1', ... strings. The result is assigned back explicitly; calling
# replace(..., inplace=True) on the selected column is not guaranteed to
# update the DataFrame.
sub_br_t['0'].obs['leiden'] = sub_br_t['0'].obs['leiden'].cat.codes.astype(str)

In [None]:
# Single-panel UMAP of subset '0' coloured by its sub-cluster labels.
fig, ax = plt.subplots(1, 1, figsize=(5.25, 5))
ax.set_facecolor('gray')
sc.pl.umap(sub_br_t['0'], color='leiden', ax=ax, show=False, return_fig=False, size=10, use_raw=True)
plt.tight_layout()

In [None]:
# Sub-clusters of subset '0' that are merged into one group.
groupings = [[0, 1, 2, 3, 5, 6]]
grouped_clusts = [clust for group in groupings for clust in group]
numclusts = np.unique(sub_br_t['0'].obs['leiden'].values.astype(int))
# Every sub-cluster not merged above becomes its own single-member group.
groupings.extend([clust] for clust in np.setdiff1d(numclusts, grouped_clusts))

In [None]:
# 'ct<N>' -> list of sub-cluster numbers merged into group N.
ctdict = {'ct%s' % idx: members for idx, members in enumerate(groupings)}
cluster_to_ct = {str(clust): ct for ct, members in ctdict.items() for clust in members}

# Exact-label lookup. A regex replace would also rewrite digits inside longer
# labels and inside the 'ct<N>' names already written, and an in-place replace
# on a selected column is not guaranteed to write back.
old_labels = sub_br_t['0'].obs['leiden'].astype(str)
sub_br_t['0'].obs['celltype'] = old_labels.map(lambda label: cluster_to_ct.get(label, label))
# The new Leiden label is the group number without the 'ct' prefix.
sub_br_t['0'].obs['leiden'] = [ct.strip('ct') for ct in sub_br_t['0'].obs['celltype']]

In [None]:
# Single-panel UMAP of subset '0' after merging its sub-clusters.
fig, ax = plt.subplots(1, 1, figsize=(5.25, 5))
ax.set_facecolor('gray')
sc.pl.umap(sub_br_t['0'], color='leiden', ax=ax, show=False, return_fig=False, size=10, use_raw=True)
plt.tight_layout()

In [None]:
sc.tl.leiden(sub_br_t['1'], resolution=0.5) # subcluster them using Leiden
# Second pass restricted to sub-cluster '2', splitting it further.
sc.tl.leiden(sub_br_t['1'], resolution=0.3, restrict_to=('leiden',['2']))
# Relabel every category by its position in the category list, giving plain
# '0', '1', ... strings. The result is assigned back explicitly; calling
# replace(..., inplace=True) on the selected column is not guaranteed to
# update the DataFrame.
sub_br_t['1'].obs['leiden'] = sub_br_t['1'].obs['leiden'].cat.codes.astype(str)

In [None]:
# Single-panel UMAP of subset '1' coloured by its sub-cluster labels.
fig, ax = plt.subplots(1, 1, figsize=(5.25, 5))
ax.set_facecolor('gray')
sc.pl.umap(sub_br_t['1'], color='leiden', ax=ax, show=False, return_fig=False, size=10, use_raw=True)
plt.tight_layout()

In [None]:
# Shorter feature list (8 entries) for the 2x4 panel grid below; this rebinds `features`.
features = ['CD8|CD8A', 'CD4|CD4', 'CCR4', 'PTGDR2', 
            'CD185|CXCR5','CXCR3','CCR6', 'CCL5'
           ]

In [None]:
fig, ax = plt.subplots(2,4,figsize=(25,13))
# NOTE(review): this plots sub_br_t['0'] although the neighbouring cells work on
# sub_br_t['1'] — confirm which subset is intended.
for ax, f in zip(np.ravel(ax), features):
    ax.set_facecolor('gray')
    sc.pl.umap(sub_br_t['0'], color=f, ax=ax,show=False, return_fig=False, size=8, use_raw=True)
plt.tight_layout()

In [None]:
# Sub-clusters of subset '1' that are merged into one group.
groupings = [[0, 3, 4, 6]]
grouped_clusts = [clust for group in groupings for clust in group]
numclusts = np.unique(sub_br_t['1'].obs['leiden'].values.astype(int))
# Every sub-cluster not merged above becomes its own single-member group.
groupings.extend([clust] for clust in np.setdiff1d(numclusts, grouped_clusts))

In [None]:
# 'ct<N>' -> list of sub-cluster numbers merged into group N.
ctdict = {'ct%s' % idx: members for idx, members in enumerate(groupings)}
cluster_to_ct = {str(clust): ct for ct, members in ctdict.items() for clust in members}

# Exact-label lookup. A regex replace would also rewrite digits inside longer
# labels and inside the 'ct<N>' names already written, and an in-place replace
# on a selected column is not guaranteed to write back.
old_labels = sub_br_t['1'].obs['leiden'].astype(str)
sub_br_t['1'].obs['celltype'] = old_labels.map(lambda label: cluster_to_ct.get(label, label))
# The new Leiden label is the group number without the 'ct' prefix.
sub_br_t['1'].obs['leiden'] = [ct.strip('ct') for ct in sub_br_t['1'].obs['celltype']]

In [None]:
# Single-panel UMAP of subset '1' after merging its sub-clusters.
fig, ax = plt.subplots(1, 1, figsize=(5.25, 5))
ax.set_facecolor('gray')
sc.pl.umap(sub_br_t['1'], color='leiden', ax=ax, show=False, return_fig=False, size=10, use_raw=True)
plt.tight_layout()

In [None]:
# Single Leiden pass on subset '2' (no restricted second pass), then plot the result.
sc.tl.leiden(sub_br_t['2'], resolution=0.4) # subcluster them using Leiden
sc.pl.umap(sub_br_t['2'],color='leiden', size=5)

In [None]:
# Sub-clusters of subset '2' that are merged into one group.
groupings = [[0, 1, 2]]
grouped_clusts = [clust for group in groupings for clust in group]
numclusts = np.unique(sub_br_t['2'].obs['leiden'].values.astype(int))
# Every sub-cluster not merged above becomes its own single-member group.
groupings.extend([clust] for clust in np.setdiff1d(numclusts, grouped_clusts))

In [None]:
# 'ct<N>' -> list of sub-cluster numbers merged into group N.
ctdict = {'ct%s' % idx: members for idx, members in enumerate(groupings)}
cluster_to_ct = {str(clust): ct for ct, members in ctdict.items() for clust in members}

# Exact-label lookup. A regex replace would also rewrite digits inside longer
# labels and inside the 'ct<N>' names already written, and an in-place replace
# on a selected column is not guaranteed to write back.
old_labels = sub_br_t['2'].obs['leiden'].astype(str)
sub_br_t['2'].obs['celltype'] = old_labels.map(lambda label: cluster_to_ct.get(label, label))
# The new Leiden label is the group number without the 'ct' prefix.
sub_br_t['2'].obs['leiden'] = [ct.strip('ct') for ct in sub_br_t['2'].obs['celltype']]

In [None]:
# Single-panel UMAP of subset '2' after merging its sub-clusters.
fig, ax = plt.subplots(1, 1, figsize=(5.25, 5))
ax.set_facecolor('gray')
sc.pl.umap(sub_br_t['2'], color='leiden', ax=ax, show=False, return_fig=False, size=10, use_raw=True)
plt.tight_layout()

In [None]:
sc.tl.leiden(sub_br_t['4'], resolution=0.3) # subcluster them using Leiden
# Second pass restricted to sub-cluster '1', splitting it further.
sc.tl.leiden(sub_br_t['4'], resolution=0.25, restrict_to=('leiden',['1']))
# Relabel every category by its position in the category list, giving plain
# '0', '1', ... strings. The result is assigned back explicitly; calling
# replace(..., inplace=True) on the selected column is not guaranteed to
# update the DataFrame.
sub_br_t['4'].obs['leiden'] = sub_br_t['4'].obs['leiden'].cat.codes.astype(str)
sc.pl.umap(sub_br_t['4'],color='leiden', size=5)

In [None]:
sc.tl.leiden(sub_br_t['5'], resolution=0.3) # subcluster them using Leiden
# Second pass restricted to sub-cluster '1', splitting it further.
sc.tl.leiden(sub_br_t['5'], resolution=0.3, restrict_to=('leiden',['1']))
# Relabel every category by its position in the category list, giving plain
# '0', '1', ... strings. The result is assigned back explicitly; calling
# replace(..., inplace=True) on the selected column is not guaranteed to
# update the DataFrame.
sub_br_t['5'].obs['leiden'] = sub_br_t['5'].obs['leiden'].cat.codes.astype(str)
sc.pl.umap(sub_br_t['5'],color='leiden', size=5)

In [None]:
# Single Leiden pass on subset '8', then plot the result.
sc.tl.leiden(sub_br_t['8'], resolution=0.2) # subcluster them using Leiden
sc.pl.umap(sub_br_t['8'],color='leiden', size=5)

In [None]:
def sub_cluster_mapper(adata, sub_adatas):
    '''
    Fold sub-clustering results back into ``adata.obs['leiden']``.

    adata = object with ``.obs_names`` and an ``.obs['leiden']`` column of integer-like strings
    sub_adatas = dict keyed by parent cluster label; each value has an ``.obs['leiden']``
    column holding the sub-cluster label of every cell of that parent

    A cell keeps its parent label when the parent was not sub-clustered or when its
    sub-cluster is '0'. Every other (parent, sub-cluster) pair gets a new label,
    numbered upwards from the largest existing label + 1.

    ``adata`` is modified in place and also returned.
    '''
    # How many brand-new labels are needed: every sub-clustered parent keeps its own
    # label for sub-cluster '0', so only the remaining sub-clusters are counted.
    n_subclusters = sum(
        sub.obs['leiden'].astype(int).unique().max() + 1 for sub in sub_adatas.values()
    )
    n_extra = n_subclusters - len(sub_adatas)
    first_new = max(adata.obs['leiden'].astype(int)) + 1
    fresh_names = [str(k) for k in range(first_new, first_new + n_extra)]

    # Provisional labels: "<parent>.<sub>" for every cell that moved to a non-zero sub-cluster.
    provisional = []
    for cell, parent in adata.obs['leiden'].copy().items():
        if parent in sub_adatas:
            sub_label = sub_adatas[parent].obs.loc[cell, 'leiden']
            if sub_label != '0':
                provisional.append(parent + '.%s' % sub_label)
                continue
        provisional.append(parent)
    merged = pd.Series(provisional, index=adata.obs_names)

    # Swap the provisional "<parent>.<sub>" names for the fresh integer names.
    composite = np.setdiff1d(merged, adata.obs['leiden'])
    merged.replace(dict(zip(composite, fresh_names)), inplace=True)

    # The int conversion fails loudly if any provisional name is left over;
    # the column finally stored holds plain strings.
    adata.obs['leiden'] = merged.astype(int)
    adata.obs['leiden'] = merged.astype(str)
    return adata

In [None]:
# Write the sub-cluster labels back into br_t.obs['leiden'].
br_t = sub_cluster_mapper(br_t, sub_br_t)

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(20, 10))
# (obs column to colour by, palette) for each of the two panels.
panel_specs = [('cond', None), ('leiden', sc.pl.palettes.default_20)]
for panel, (color, palette) in zip(axes, panel_specs):
    panel.set_facecolor('black')
    sc.pl.umap(br_t, color=color, ax=panel, show=False, return_fig=False, size=10, palette=palette)

In [None]:
fig, axes = plt.subplots(3, 7, figsize=(20, 9))
leiden_levels = br_t.obs['leiden'].dtype.categories
# One panel per Leiden cluster: a temporary boolean column marks that cluster's cells.
for level, panel in tqdm(zip(leiden_levels, axes.ravel())):
    br_t.obs['val'] = br_t.obs['leiden'] == level
    panel.set_facecolor('black')
    sc.pl.umap(br_t, color='val', ax=panel, show=False, return_fig=False, title=level)
    br_t.obs.drop(columns='val', inplace=True)
plt.tight_layout()

# Remove Contamination

See note in the `acg_t` notebook about this.

In [None]:
# Drop the cells of clusters 6, 7, 10, 11 and 15 and keep an independent copy of the rest.
br_t = br_t[~br_t.obs['leiden'].isin(['6', '7' ,'10', '11', '15']),:].copy()

Renumber clusters:

In [None]:
# Renumber the remaining clusters 0..k-1 in increasing numeric order of their old labels.
# The distinct labels are sorted directly, so this does not depend on an earlier
# plotting call having converted the column to a categorical.
old_labels = br_t.obs['leiden'].astype(str)
clusts = sorted(old_labels.unique(), key=int)
clust_dict = dict(zip(clusts, range(len(clusts))))
br_t.obs['leiden'] = old_labels.map(clust_dict).astype(str) # plain strings, as before
# br_t.obs['leiden'] = new_leiden.astype('category') # don't do this, it messes things up, just let scanpy do it as it plots

# Visualization Post-Subclustering

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(20, 10))
# (obs column to colour by, palette) for each of the two panels.
panel_specs = [('cond', None), ('leiden', sc.pl.palettes.default_20)]
for panel, (color, palette) in zip(axes, panel_specs):
    panel.set_facecolor('black')
    sc.pl.umap(br_t, color=color, ax=panel, show=False, return_fig=False, size=10, palette=palette)

In [None]:
fig, axes = plt.subplots(1, 6, figsize=(30, 5))
cond_levels = br_t.obs['cond'].dtype.categories
# One panel per condition: a temporary boolean column marks that condition's cells.
for level, panel in tqdm(zip(cond_levels, axes.ravel())):
    br_t.obs['val'] = br_t.obs['cond'] == level
    panel.set_facecolor('black')
    sc.pl.umap(br_t, color='val', ax=panel, show=False, return_fig=False, title=level)
    br_t.obs.drop(columns='val', inplace=True)
plt.tight_layout()

In [None]:
fig, axes = plt.subplots(4, 4, figsize=(20, 19))
leiden_levels = br_t.obs['leiden'].dtype.categories
# One panel per Leiden cluster: a temporary boolean column marks that cluster's cells.
for level, panel in tqdm(zip(leiden_levels, axes.ravel())):
    br_t.obs['val'] = br_t.obs['leiden'] == level
    panel.set_facecolor('black')
    sc.pl.umap(br_t, color='val', ax=panel, show=False, return_fig=False, title=level)
    br_t.obs.drop(columns='val', inplace=True)
plt.tight_layout()

In [None]:
# Same 16 features as in the initial marker plot; rebinds `features` for the 4x4 grid below.
features = ['CD8|CD8A', 'CD4|CD4', 'CD45RO|PTPRC', 'CD45RA|PTPRC',
           'TRDC', 'FOXP3', 'CCR4', 'CD69|CD69',
           'CD185|CXCR5', 'CD62L|SELL','CXCR3','CCR6',
           'CD26|DPP4', 'CCR7', 'CCL5', 'CD103|ITGAE']

In [None]:
fig, axes = plt.subplots(4, 4, figsize=(25, 24))
# One UMAP panel per entry of `features`.
for panel, feature in zip(axes.ravel(), features):
    panel.set_facecolor('black')
    sc.pl.umap(br_t, color=feature, ax=panel, show=False, return_fig=False, size=5, use_raw=True)
plt.tight_layout()

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(20, 10))
# (obs column to colour by, palette) for each of the two panels.
panel_specs = [('cond', None), ('leiden', sc.pl.palettes.default_20)]
for panel, (color, palette) in zip(axes, panel_specs):
    panel.set_facecolor('black')
    sc.pl.umap(br_t, color=color, ax=panel, show=False, return_fig=False, size=10, palette=palette)

In [None]:
path = prefix + 'pkls/aggr/br_t/br_t.2.pkl'

# with open(path,'wb') as file:
#     pkl.dump(br_t, file, protocol=4)
    
# Checkpoint: this REPLACES the in-memory br_t with the pickled object, so everything
# below runs on the saved state rather than on what the cells above computed.
# NOTE: unpickling can execute arbitrary code; only load pickles from a trusted source.
with open(path,'rb') as file:
    br_t = pkl.load(file)

# Explore Differential Gene Expression

## Rank Genes

In [None]:
# Rank genes for every Leiden cluster with scanpy logging and Python warnings
# silenced. Both are restored to their previous state afterwards, even if
# ranking fails, instead of being reset to hard-coded values.
previous_verbosity = sc.settings.verbosity
sc.settings.verbosity = 0
try:
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        sc.tl.rank_genes_groups(br_t, groupby='leiden', n_genes=100, use_raw=True)
    sc.pl.rank_genes_groups(br_t, ncols=5, n_genes=20)
finally:
    sc.settings.verbosity = previous_verbosity

In [None]:
# Rank genes of cluster '14' against reference cluster '5' with scanpy logging
# and Python warnings silenced. Both are restored to their previous state
# afterwards, even if ranking fails, instead of being reset to hard-coded values.
previous_verbosity = sc.settings.verbosity
sc.settings.verbosity = 0
try:
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        sc.tl.rank_genes_groups(br_t, groupby='leiden', n_genes=100, groups=['14'], reference='5', use_raw=True)
    sc.pl.rank_genes_groups(br_t, ncols=5, n_genes=20)
finally:
    sc.settings.verbosity = previous_verbosity

## Grouped Rank

Here's a function to group different clusters together to observe their combined differential expression to other clusters.

In [None]:
def grouped_rank(adata, groups, return_genes=True, size=5, n_genes=20, figsize=(5,5)):
    '''
    Rank genes between two groups of Leiden clusters and plot the result.

    adata = object with ``.obs['leiden']``, a UMAP embedding and a ``.raw`` attribute
    groups = sequence of exactly two collections of cluster labels (ints or strings);
    the caller's sequence is left unmodified
    return_genes = if true, return the list of ranked gene names, otherwise None
    size = point size of the two UMAP panels
    n_genes = number of genes to rank and annotate
    figsize = size of the ranked-genes figure

    Draws one UMAP panel per group, then runs ``sc.tl.rank_genes_groups`` on a copy
    of ``adata`` (group '0' against reference '1', using raw) and writes the gene
    names at their scores. The temporary .obs columns are removed again.
    '''
    assert len(groups) == 2
    # String copies of the two groups, so the caller's list is never mutated.
    members = [np.asarray(group).astype(str) for group in groups]
    leiden = adata.obs['leiden'].astype(str)
    # Implicit third group: every cluster that is in neither requested group.
    all_clusts = np.asarray(leiden.unique(), dtype=str)
    members.append(np.setdiff1d(all_clusts, np.concatenate(members)))

    fig, axes = plt.subplots(1, 2, figsize=(9, 4))
    for clusts, title, ax in zip(members, ['0', '1'], axes):
        adata.obs['val'] = leiden.isin(clusts).values
        ax.set_facecolor('black')
        sc.pl.umap(adata, color='val', ax=ax, size=size, show=False, return_fig=False, title=title)
    adata.obs.drop(columns='val', inplace=True)

    # Exact cluster -> group-index lookup. A regex replace would also match digits
    # inside longer labels (e.g. '4' inside '14') and mislabel those cells.
    group_of = {}
    for idx, clusts in enumerate(members):
        for clust in clusts:
            group_of.setdefault(str(clust), str(idx))
    adata.obs['rank_compare'] = leiden.map(group_of).astype('category')

    fig, ax = plt.subplots(1, 1, figsize=figsize)
    previous_verbosity = sc.settings.verbosity
    sc.settings.verbosity = 0
    try:
        # Rank on a copy so the results are not stored on the caller's object.
        rank_adata = adata.copy()
        sc.tl.rank_genes_groups(rank_adata, groupby='rank_compare', n_genes=n_genes, groups=['0'], reference='1', use_raw=True)
    finally:
        # Restore logging and remove the helper column even if ranking failed.
        sc.settings.verbosity = previous_verbosity
        adata.obs.drop(columns='rank_compare', inplace=True)

    ranked = rank_adata.uns['rank_genes_groups']
    genes = [i[0] for i in ranked['names'].tolist()]
    x = [i[0] for i in ranked['scores'].tolist()][:n_genes]
    y = range(n_genes)[::-1]
    # Invisible markers (s=0) only set the axis limits; the gene names are the plot.
    ax.scatter(x, y, s=0)
    for i, txt in enumerate(genes[:n_genes]):
        ax.annotate(txt, (x[i], y[i]), rotation=0, size=10)
    ax.set_yticklabels([])
    ax.set_title('0 vs 1')

    if return_genes:
        return genes
    return None

In [None]:
# g1: cluster(s) of interest; g2: every other number below 20.
g1 = [4]
g2 = [i for i in range(20) if i not in g1]

In [None]:
# Convert both groups to string labels.
g1 = [str(i) for i in g1]
g2 = [str(i) for i in g2]

In [None]:
# Rank g1 against g2; `genes` receives the ranked gene names.
genes = grouped_rank(br_t,groups=[g1, g2], size=5, n_genes=20, figsize=(5,5))

### Low-Count Differential Expression

I've noticed the default `rank_genes_groups` seems to be biased towards very highly expressed genes, whereas sometimes a gene is expressed very specifically in one population but at very low counts. The score is supposed to be based on significance (which should account for the specificity), but a lot of the time it's just genes that are expressed everywhere but slightly higher. Here, I write a function to generate a dataframe of genes that at least a fraction _p_ of cells express (> 0 counts), and compare this fraction to all other clusters to see which genes are being expressed specifically, albeit lowly, in a specific population.

In [None]:
def low_de_make(adata, p=0.2, p_of='any'):
    '''
    Build a (clusters x genes) table of the fraction of cells with a stored
    non-zero entry for each gene.

    adata = adata object with a raw attribute (sparse ``raw.X``) and a categorical
    ``obs['leiden']``
    p = fraction (0-1) of cells that should be expressing a gene for it to be reported
    p_of = 'any' keeps genes where more than p of the cells in ANY cluster express
    the gene; a cluster label keeps genes passing the threshold in that cluster only

    Returns a float16 DataFrame indexed by cluster, one column per kept gene.
    '''
    X = adata.raw.X  # only read, so no copy of the full matrix is needed
    leiden = adata.obs['leiden']
    clusts = leiden.dtype.categories

    # Filled as one array and wrapped once, rather than assigning float64 rows
    # into a float16 DataFrame one at a time.
    fractions = np.zeros((len(clusts), X.shape[1]), dtype=np.float16)
    for row, clust in enumerate(clusts):
        in_clust = (leiden == clust).values
        n_cells = int(in_clust.sum())
        if n_cells == 0:
            # Category without cells: leave the row at 0 instead of dividing by zero.
            continue
        fractions[row] = X[in_clust].tocsc().getnnz(axis=0) / n_cells
    df = pd.DataFrame(fractions, index=clusts, columns=adata.raw.var_names.values)

    if p_of == 'any':
        keep = (df > p).any(axis=0).values
    else:
        keep = (df.loc[p_of] > p).values
    return df.iloc[:, keep]

def low_de_compare(df, clust, compare=None):
    '''
    Ratio of each gene's value in `clust` to its mean over a set of other clusters.

    df = clusters x genes table, e.g. the output of low_de_make
    clust = index label of the cluster of interest
    compare = None to compare against every other cluster, or a collection of
    cluster labels (list, tuple, array, ...; a single string counts as one label)

    Returns a Series indexed by gene, sorted from the largest ratio down.
    '''
    if compare is None:
        background = df[df.index != clust]
    else:
        # Accept any collection, not just `list`; other types used to fall
        # through and return None silently.
        if isinstance(compare, str):
            compare = [compare]
        background = df[df.index.isin(list(compare))]
    return (df.loc[clust] / background.mean()).sort_values(ascending=False)

In [None]:
# Keep genes expressed in more than 10% of the cells of cluster g1[0].
df = low_de_make(br_t, p=0.1, p_of=g1[0])

In [None]:
# Ratio of cluster g1[0] to the mean over the g2 clusters, largest first.
de_genes = low_de_compare(df, g1[0], compare=g2)

In [None]:
# First ten entries, i.e. the ten largest ratios.
print(de_genes[:10])

# Gene Visualization

### Check Gene Names

In [None]:
# Name lists to search: genes in .raw, genes in the main object, and the
# .obs columns whose name contains '|' (treated as the protein columns).
raw_transcripts = br_t.raw.var_names
transcripts = br_t.var_names
proteins = [i for i in br_t.obs.columns if '|' in i]

In [None]:
# Substring search for `name` in each of the three name lists.
name = 'CCL25'
print([i for i in proteins if name in i])
print([i for i in transcripts if name in i])
print([i for i in raw_transcripts if name in i])

In [None]:
# Position of a protein column in `proteins`; raises ValueError if it is absent.
proteins.index('CD3|CD3E') 
# proteins.index('CD94|KLRD1')

### Proteins

In [None]:
fig, axes = plt.subplots(12, 8, figsize=(30, 40))
# One UMAP panel per protein column, up to the 96 panels of the grid.
for protein, panel in tqdm(zip(proteins, axes.ravel())):
    panel.set_facecolor('black')
    sc.pl.umap(br_t, color=protein, size=8, ncols=8, ax=panel, show=False, return_fig=False)
plt.tight_layout()

In [None]:
fig, axes = plt.subplots(1, 4, figsize=(20, 4.25))
# (obs column to colour by, palette) per panel; only 'leiden' gets an explicit palette.
panel_specs = [
    ('CD4|CD4', None),
    ('CD8|CD8A', None),
    ('CD3|CD3E', None),
    ('leiden', sc.pl.palettes.default_20),
]
for panel, (color, palette) in zip(axes, panel_specs):
    panel.set_facecolor('black')
    sc.pl.umap(br_t, color=color, ax=panel, show=False, return_fig=False, size=5, palette=palette)

# Single Gene Plotter

In [None]:
# Feature(s) to draw in the single-panel plot below.
f = ['CD45RO|PTPRC']

In [None]:
# Single-panel UMAP coloured by the feature chosen in `f`.
fig, ax = plt.subplots(1,1,figsize=(5,5))
ax.set_facecolor('black')
sc.pl.umap(br_t, color=f, ax=ax,show=False, return_fig=False, size=5, use_raw=True);

In [None]:
# Tab-separated annotation table, indexed by its 'cluster' column.
# NOTE(review): this reads acg.t.tsv while the rest of this notebook works on br_t —
# confirm this is the intended file.
cts = pd.read_csv(prefix + 'ct.tsvs/acg.t.tsv', sep='\t', dtype='|S').set_index('cluster')

In [None]:
# NOTE(review): `acg_t` is not defined anywhere in the visible notebook (only `br_t` is),
# so this raises NameError on a fresh kernel unless it is defined elsewhere — confirm the intended object.
s = pd.Series(acg_t.obs['leiden'].values)

In [None]:
# For every column of `cts`, add an .obs column that maps each cell's Leiden label
# through that column (labels missing from the table become NaN).
# NOTE(review): uses `acg_t`, which is not defined in the visible notebook.
for ct in cts.columns:
    acg_t.obs[ct] = s.map(dict(cts[ct])).values

In [None]:
# Single-panel UMAP coloured by the 'ct2' annotation column.
# NOTE(review): uses `acg_t`, which is not defined in the visible notebook.
fig, ax = plt.subplots(1,1,figsize=(6, 5))
for ax, f in zip(np.ravel(ax), ['ct2']):
    ax.set_facecolor('whitesmoke')
    sc.pl.umap(acg_t, color=f, ax=ax,show=False, return_fig=False, size=3)
plt.tight_layout()

In [None]:
# Display the 'ct2' column of the annotation table.
cts['ct2']