## A/C/G T/NK Cells

Here I try to phenotype the T cell and NK subsets present in the leiden clusters marked by the TNF-Alpha, Control, and IFN-Gamma conditions.

In [None]:
import scanpy as sc
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import warnings
import itertools as it
import json
import requests
import seaborn as sns
import pickle as pkl
from functools import reduce

import matplotlib as mpl
import matplotlib.patches as patches
from matplotlib.gridspec import GridSpec
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon

In [None]:
sc.settings.verbosity = 4
sc.settings.set_figure_params(dpi=80)
print(sc.__version__)
sc.settings.n_jobs=12

In [None]:
prefix = '/data/codec/production.run/mrna/'

### Load in Data

In [None]:
path = prefix + 'pkls/aggr/tnk.pkl'
    
with open(path,'rb') as file:
    tnk = pkl.load(file)

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(20,10))
for color, ax in zip(['cond', 'leiden'], ax):
    sc.pl.umap(tnk['adata'],color=color, ax=ax, show=False, return_fig=False, size=2)

Okay, after blindly following what I had done originally for the unadjusted workflow, I've now decided that I would like to do the gene filtering and the highly variable gene extraction using only these cells. For that reason, I'm going to extract the cell barcodes here and then re-run everything from raw, following what I now think are best practices.

In [None]:
tnk_cells = tnk['adata'].obs_names

In [None]:
path = prefix + 'obs/aggr/tnk.bcs.txt'

# export of the barcodes, one per line (commented out; presumably it was run once to create the file)
# with open(path,'w') as file:
#     for bc in tnk_cells:
#         file.write(bc + '\n')
        
# reload the saved barcodes; this replaces whatever `tnk_cells` held before this cell
with open(path,'r') as file:
    tnk_cells = [i.strip() for i in file.readlines()]

In [None]:
path = prefix + 'pkls/aggr/wells.sng.w_covars.pkl'

# with open(path,'wb') as file:
#     pkl.dump(wells, file)
    
with open(path,'rb') as file:
    wells = pkl.load(file)

### Adjust Cell Barcodes, Filter

I'm adjusting the cell barcodes to make them match their well number, which I also did with the ADTs.

In [None]:
for well in wells:
    wells[well]['adata'].obs_names = [i[:16] + '-%s' % well for i in wells[well]['adata'].obs_names]

### Concatenate

In [None]:
tnk = wells[0]['adata'].concatenate(*[wells[i]['adata'] for  i in range(1, 12)])[tnk_cells].copy() # I really shouldn't do this, I should go back and run cellranger aggr, but for now just concatenate

In [None]:
tnk.var['n_counts'] = tnk.var[[i for i in tnk.var.columns if i.startswith('n_counts')]].sum(axis=1)

### Filter Genes, Transform Data

Drop genes with very low counts.

In [None]:
plt.figure(figsize=(8,6))
plt.plot(range(len(tnk.var_names)), 
         tnk.var['n_counts'].sort_values(ascending=False).values, 
         color='k')
plt.grid(False)
plt.grid(True, 'both', 'both')
plt.xscale('log')
plt.yscale('log')

In [None]:
# drop genes with fewer than 50 total counts (this also removes genes that are now empty)
sc.pp.filter_genes(tnk, min_counts=50, inplace=True)

In [None]:
sc.pp.normalize_per_cell(tnk, counts_per_cell_after=1e6)

In [None]:
sc.pp.log1p(tnk)

In [None]:
path = '/data/codec/production.run/adts/pkls/combat/concat.combat.adts.norm.log.pkl'
with open(path,'rb') as file:
    concat_adts = pkl.load(file)
transcripts = tnk.var_names
proteins = concat_adts['adata'].var_names
adts_df = pd.DataFrame(concat_adts['adata'].X, columns=proteins, index=concat_adts['adata'].obs_names)
tnk.obs = tnk.obs.join(adts_df)

### Highly Variable Genes Extraction

In [None]:
def _log10_shifted(values):
    '''
    Return log10 of `values` after adding an offset that makes the smallest value positive.

    With `lowest` the smallest distinct value and `second` the next one, the offset is:
      * 0 when `lowest` is positive (or NaN);
      * `second` when `lowest` is 0, so zeros land on the smallest non-zero value;
      * `second - 2*lowest` when `lowest` is negative, so `lowest` lands on `second - lowest`.

    Raises ValueError if the smallest value is not positive and there is no second
    distinct value to derive the offset from.
    '''
    distinct = np.unique(values)  # np.unique already returns the values sorted ascending
    lowest = distinct[0]
    if lowest == 0 or lowest < 0:
        if len(distinct) < 2:
            raise ValueError('cannot choose a log shift: all values are equal and not positive')
        shift = distinct[1] if lowest == 0 else distinct[1] - 2 * lowest
    else:
        shift = 0
    return np.log10(values + shift)


def hv_run(adata, flavor='cell_ranger', min_mean=0.0125, min_disp=0.5, max_mean=3, bins=500):
    '''
    Run the highly variable genes extraction on a copy of `adata` and return that copy.

    The input object is not modified and no genes are subset yet. The returned copy
    carries the columns written by `sc.pp.highly_variable_genes` plus two extra `.var`
    columns, 'log_means' and 'log_disps', holding log10 of the (shifted) 'means' and
    'dispersions_norm' values for use by `hv_plot` and `hv_genes`.

    flavor, min_mean, min_disp, max_mean: forwarded unchanged to `sc.pp.highly_variable_genes`.
    bins: forwarded as `n_bins`.
    '''
    hv_adata = adata.copy() # make a copy because I don't want to change the original just yet
    sc.pp.highly_variable_genes(hv_adata, flavor=flavor, inplace=True,
                                min_mean=min_mean,
                                min_disp=min_disp,
                                max_mean=max_mean,
                                n_bins=bins)

    # both quantities can be zero or negative, so shift them before taking the log
    hv_adata.var['log_means'] = _log10_shifted(hv_adata.var['means'].values)
    hv_adata.var['log_disps'] = _log10_shifted(hv_adata.var['dispersions_norm'].values)

    return hv_adata

def hv_plot(hv_adata, gate=None, highlight_genes=None, bw='scott'):
    '''
    Plot the log10 means and log10 normalized dispersions stored on `hv_adata`.

    Four panels are drawn: a 2-D density of (log mean, log dispersion), a scatter of
    the same points, and a 1-D density for each of the two quantities.

    hv_adata: object whose `.var` has the 'log_means' and 'log_disps' columns (see `hv_run`).
    gate: optional sequence of (log_mean, log_disp) vertices, drawn as a translucent
        polygon on the scatter panel.
    highlight_genes: optional collection of gene names; matching genes are circled in red.
    bw: bandwidth argument passed to every `sns.kdeplot` call.
    '''
    # The values are already log10-transformed, so everything is plotted on linear axes.
    # Using log-scaled axes instead would break gates that contain sloped edges.
    log_means = hv_adata.var['log_means'].values
    log_disps = hv_adata.var['log_disps'].values

    fig = plt.figure(figsize=(19, 6))
    gs = GridSpec(2, 3, figure=fig)
    ax1 = fig.add_subplot(gs[:, 0])
    ax2 = fig.add_subplot(gs[:, 1])
    ax3 = fig.add_subplot(gs[0, 2])
    ax4 = fig.add_subplot(gs[1, 2])

    # panel 1: 2-D density
    # NOTE(review): positional x/y with `bw`, `shade` and `shade_lowest` looks like the older
    # seaborn kdeplot signature; confirm against the installed version before upgrading.
    ax1.get_xaxis().set_minor_locator(mpl.ticker.AutoMinorLocator())
    ax1.get_yaxis().set_minor_locator(mpl.ticker.AutoMinorLocator())
    ax1.grid(True, which='both', axis='both', alpha=0.4)
    ax1.set_ylabel('Log Dispersions')
    ax1.set_xlabel('Log Means')
    sns.kdeplot(log_means, log_disps, bw=bw, shade=True, shade_lowest=False, ax=ax1, color='skyblue')

    # panel 2: scatter; minor ticks plus the grid help when choosing gate coordinates
    ax2.minorticks_on()
    ax2.grid(True, which='both', axis='both')
    ax2.scatter(log_means, log_disps, s=0.1, c='b')
    ax2.set_ylabel('Log Dispersions')
    ax2.set_xlabel('Log Means')

    # panel 3: 1-D density of the log means
    ax3.minorticks_on()
    ax3.grid(True, which='both', axis='both')
    ax3.set_title('Log Means')
    sns.kdeplot(log_means, bw=bw, ax=ax3, color='blue')

    # panel 4: 1-D density of the log dispersions
    ax4.minorticks_on()
    ax4.grid(True, which='both', axis='both')
    ax4.set_title('Log Dispersions')
    sns.kdeplot(log_disps, bw=bw, ax=ax4, color='blue')

    plt.tight_layout()

    if highlight_genes is not None: # do you want to highlight some genes?
        wanted = set(highlight_genes)
        mask = [name in wanted for name in hv_adata.var_names] # boolean mask of which genes to highlight
        # red circle around the blue dot of every highlighted gene
        ax2.scatter(log_means[mask], log_disps[mask], s=30, facecolors='none', edgecolors='r')

    if gate is not None:
        # Draw the gate on the scatter panel. Rectangular gates (right angles only) are the
        # safe choice: sloped edges were observed to select the wrong points when drawn in log space.
        gatepatch = patches.Polygon(gate, linewidth=1, edgecolor='teal', facecolor='turquoise', alpha=0.5)
        ax2.add_patch(gatepatch)

def hv_genes(hv_adata, gate, adata=None):
    '''
    Select the genes whose (log mean, log dispersion) point lies inside `gate`.

    hv_adata: object whose `.var` has the 'log_means' and 'log_disps' columns (see `hv_run`).
    gate: sequence of (log_mean, log_disp) vertices describing the polygon.
    adata: optional object to subset; its genes must be in the same order as `hv_adata`'s.

    Returns a copy of `adata` restricted to the gated genes when `adata` is given,
    otherwise a list of booleans (one per gene) marking the genes inside the gate.
    '''
    # Rectangular gates (right angles only) are the safe choice: sloped edges were
    # observed to select the wrong points when the gate was drawn in log space.
    log_means = hv_adata.var['log_means'].values
    log_disps = hv_adata.var['log_disps'].values

    polygon = Polygon(gate) # make your gate a shapely.geometry.Polygon
    # this is the workhorse: test every gene's point against the gate
    genes = [polygon.contains(Point(mean, disp)) for mean, disp in zip(log_means, log_disps)]

    if adata is not None: # if the adata has been provided, subset it
        return adata[:, genes].copy()
    return genes # otherwise return the boolean list noting which genes will be kept

Generate a "highly variable adata" object that has run the highly variable genes extraction function. I generate a separate object because I don't want to necessarily change the original object.

In [None]:
hv_adata = hv_run(tnk)

In [None]:
# NOTE(review): the prefix 'MT' also matches symbols such as MT1X or MTOR; if only mitochondrial genes are meant, the usual prefix is 'MT-' — confirm
hv_plot(hv_adata, highlight_genes= [i for i in tnk.var_names if i.startswith('MT')])

In [None]:
xrange = (-1.3, 1.2)
yrange = (0.5, 2)
gate = np.array([(xrange[0], yrange[0]), 
                  (xrange[0], yrange[1]), 
                  (xrange[1], yrange[1]), 
                  (xrange[1], yrange[0])])

In [None]:
hv_plot(hv_adata, gate=gate)

In [None]:
genes = hv_genes(hv_adata, gate=gate)

In [None]:
tnk.var_names[genes][:10].values

In [None]:
sum(genes)

You can get a new adata object with your genes now subsetted:

In [None]:
tnk = hv_genes(hv_adata, gate=gate, adata=tnk)

In [None]:
tnk.shape

In [None]:
fig, ax = plt.subplots(1,2,figsize=(10,4))
for vals, ax in zip(['percent_mito','n_counts'], np.ravel(ax)):
    ax.hist(tnk.obs[vals].values,bins=100)

In [None]:
sc.pp.regress_out(tnk, ['percent_mito','n_counts'],n_jobs=1)

In [None]:
sc.pp.scale(tnk)

In [None]:
sc.pp.combat(tnk, key='batch',covariates=['cond','free_id'])

In [None]:
sc.pp.pca(tnk,n_comps=150)

In [None]:
sc.pl.pca_variance_ratio(tnk,log=True, n_pcs=150)

In [None]:
sc.pp.neighbors(tnk,n_neighbors=15,n_pcs=80)

In [None]:
sc.tl.umap(tnk)

In [None]:
print(1)

In [None]:
sc.tl.leiden(tnk, resolution=0.6)

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(20,10))
for color, ax, palette in zip(['cond', 'leiden'], ax, [None, sc.pl.palettes.default_20]):
    sc.pl.umap(tnk,color=color, ax=ax, show=False, return_fig=False, size=2, palette=palette)

In [None]:
path = prefix + 'pkls/aggr/tnk.2.pkl'

# with open(path,'wb') as file:
#     pkl.dump(tnk, file, protocol=4)

with open(path,'rb') as file:
    tnk = pkl.load(file)

In [None]:
clusts = [6, 10]

In [None]:
sub_tnk = dict()
for clust in clusts:
    sub_tnk[str(clust)] = tnk[tnk.obs['leiden'] == str(clust)].copy()

In [None]:
sc.tl.leiden(sub_tnk['6'], resolution=0.1) # subcluster them using Leiden
sc.pl.umap(sub_tnk['6'],color='leiden', size=0.6)

In [None]:
sc.tl.leiden(sub_tnk['10'], resolution=0.2) # subcluster them using Leiden
sc.pl.umap(sub_tnk['10'],color='leiden', size=0.6)

In [None]:
groupings = [[0, 2],
             ]
grouped_clusts = [i for j in groupings for i in j]
numclusts = np.unique(sub_tnk['10'].obs['leiden'].values.astype(int))
for i in np.setdiff1d(numclusts, grouped_clusts):
    groupings.append([i])

In [None]:
# name each merged group 'ct<i>' and record which leiden labels belong to it
ctdict = {'ct%s' % i: members for i, members in enumerate(groupings)}
label_to_ct = dict()
for ct, members in ctdict.items():
    for clust in members:
        label_to_ct.setdefault(str(clust), ct) # the first grouping that lists a label wins

# Exact-label lookup rather than `replace(..., regex=True)`: a regex replace on plain strings
# substitutes substrings, so '1' would also rewrite '10' or the digit of an already assigned 'ct1'.
old_labels = sub_tnk['10'].obs['leiden'].astype(str)
sub_tnk['10'].obs['celltype'] = [label_to_ct.get(label, label) for label in old_labels]
# the new leiden label is the group index, i.e. the celltype name without its 'ct' prefix
sub_tnk['10'].obs['leiden'] = [ct.strip('ct') for ct in sub_tnk['10'].obs['celltype']]

In [None]:
sc.pl.umap(sub_tnk['10'],color=['leiden'], size=3)

In [None]:
def sub_cluster_mapper(adata, sub_adatas):
    '''
    Fold sub-clustering results back into `adata.obs['leiden']` and return `adata`.

    adata: object whose `.obs['leiden']` holds integer-like cluster labels.
    sub_adatas: dict mapping a parent cluster label (as a string) to the object holding
        only that cluster's cells, with its own `.obs['leiden']` sub-cluster labels.

    Cells in sub-cluster '0' keep their parent label. Every other (parent, sub-cluster)
    pair becomes a new cluster, numbered consecutively after the largest existing label.
    New numbers are handed out in the sorted (string) order of the provisional
    '<parent>.<sub>' names, e.g. '10.1' before '6.1'.

    Note that `adata` is modified in place; pass a copy if the original labels are
    still needed (for instance to retry with different resolutions).
    '''
    parent = adata.obs['leiden'].astype(str)

    # provisional labels: '<parent>.<sub>' for every cell that left sub-cluster '0'
    provisional = parent.copy()
    for clust_name, sub_adata in sub_adatas.items():
        members = parent.index[parent == clust_name]
        sub_labels = sub_adata.obs.loc[members, 'leiden'].astype(str)
        split_off = sub_labels[sub_labels != '0']
        if not split_off.empty:
            provisional.loc[split_off.index] = clust_name + '.' + split_off

    # rename the '<parent>.<sub>' labels to fresh integer names
    added_clusts = sorted(set(provisional) - set(parent))
    new_clust_names_start = parent.astype(int).max() + 1
    rename = {label: str(new_clust_names_start + offset) for offset, label in enumerate(added_clusts)}

    # keep the column as plain strings; converting to 'category' here messes things up,
    # scanpy does that conversion itself when it plots
    adata.obs['leiden'] = [rename.get(label, label) for label in provisional]
    return adata

In [None]:
tnk = sub_cluster_mapper(tnk, sub_tnk)

In [None]:
sc.pl.umap(tnk,color=['leiden'], size=3)

In [None]:
groupings = [[1, 3, 4, 7, 8, 11],
             [0, 5, 9],
             [2, 10]
             ]
grouped_clusts = [i for j in groupings for i in j]
numclusts = np.unique(tnk.obs['leiden'].values.astype(int))
for i in np.setdiff1d(numclusts, grouped_clusts):
    groupings.append([i])

In [None]:
# name each merged group 'ct<i>' and record which leiden labels belong to it
ctdict = {'ct%s' % i: members for i, members in enumerate(groupings)}
label_to_ct = dict()
for ct, members in ctdict.items():
    for clust in members:
        label_to_ct.setdefault(str(clust), ct) # the first grouping that lists a label wins

# Exact-label lookup rather than `replace(..., regex=True)`: a regex replace on plain strings
# substitutes substrings, so '1' would also rewrite '10' and '11', and '0' would rewrite 'ct0'.
old_labels = tnk.obs['leiden'].astype(str)
tnk.obs['celltype'] = [label_to_ct.get(label, label) for label in old_labels]
# the new leiden label is the group index, i.e. the celltype name without its 'ct' prefix
tnk.obs['leiden'] = [ct.strip('ct') for ct in tnk.obs['celltype']]

In [None]:
[i for i in tnk.obs.columns if 'CD16' in i]

In [None]:
sc.pl.umap(tnk, color=['leiden', 'CD3|CD3E','CD4|CD4','CD8|CD8A','CD56|NCAM1','CD16|FCGR3A'], size=3)

In [None]:
sc.settings.verbosity = 0 
warnings.filterwarnings('ignore')
sc.tl.rank_genes_groups(tnk, groupby='leiden', n_genes=20, groups=['4','5'])
warnings.filterwarnings('default')
sc.pl.rank_genes_groups(tnk, ncols=5, n_genes=20)
sc.settings.verbosity = 4

Cluster 4 is probably B/T multiplets and 5 is probably T/M multiplets.

In [None]:
acg_t = tnk[tnk.obs['leiden'] == '0'].copy()
# acg_nk = tnk[tnk.obs['leiden'] == '3'].copy()
# br_t = tnk[tnk.obs['leiden'] == '1'].copy()
# br_nk = tnk[tnk.obs['leiden'] == '7'].copy()
# p_t = tnk[tnk.obs['leiden'] == '2'].copy()
# p_nk = tnk[tnk.obs['leiden'] == '6'].copy()

# Cell Type Split

In [None]:
total_pcs = 75

In [None]:
sc.settings.n_jobs = 12
sc.pp.pca(acg_t,n_comps=total_pcs)

In [None]:
sc.pl.pca_variance_ratio(acg_t,log=True, n_pcs=total_pcs)

In [None]:
df_loadings = pd.DataFrame(acg_t.varm['PCs'], index=acg_t.var_names)
df_rankings = pd.DataFrame((-1 * df_loadings.values).argsort(0).argsort(0), index=df_loadings.index, columns=df_loadings.columns)

In [None]:
num = 50
percent_ribos = list()
for pc in range(total_pcs):
    top_genes = df_loadings[pc].sort_values(ascending=True)[:num].index
    percent_ribos.append(len([i for i in top_genes if i.startswith('RP')])/num)

In [None]:
plt.bar(range(len(percent_ribos)), percent_ribos);

In [None]:
# convert to an array first: comparing the plain list with a float raises a TypeError
np.array(percent_ribos) > 0.35

In [None]:
# one boolean per PC (len = total_pcs); True keeps the PC
# NOTE(review): derived from the 0.35 ribosomal-fraction cutoff used in the previous cell — confirm this is the intended rule
pc_keep_bool = ~(np.array(percent_ribos) > 0.35)

In [None]:
# keep only the PC columns flagged in `pc_keep_bool`, for both the loadings and the cell embeddings
# NOTE(review): `adata` is not defined anywhere in the visible notebook; presumably this template is meant for `acg_t` — confirm before running
adata.varm['PCs'] = adata.varm['PCs'][:,pc_keep_bool]
adata.obsm['X_pca'] = adata.obsm['X_pca'][:,pc_keep_bool]

Then, when feeding to neighbors:

In [None]:
# build the neighbor graph on exactly as many PCs as were kept above
# NOTE(review): `adata` is not defined anywhere in the visible notebook; presumably `acg_t` is meant — confirm before running
warnings.filterwarnings('ignore')
sc.pp.neighbors(adata,n_neighbors=15,n_pcs=pc_keep_bool.sum()) # numba warning outlined here is not to be of concern: https://github.com/lmcinnes/umap/issues/252
warnings.filterwarnings('default')

In [None]:
df_loadings[24].sort_values(ascending=True)[:num].index

In [None]:
print(['SELL' in i for i in map(lambda x: df_loadings[x].sort_values(ascending=False)[:num].index, range(40))])

In [None]:
warnings.filterwarnings('ignore')
sc.pp.neighbors(acg_t, n_neighbors=15, n_pcs=40)
warnings.filterwarnings('default')

In [None]:
sc.tl.umap(acg_t)

In [None]:
sc.tl.leiden(acg_t, resolution=1.2)

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(20,10))
for color, ax, palette in zip(['cond', 'leiden'], ax, [None, sc.pl.palettes.default_20]):
    sc.pl.umap(acg_t,color=color, ax=ax, show=False, return_fig=False, size=4, palette=palette)

In [None]:
fig, ax = plt.subplots(2, 7, figsize=(30,8))
for val, ax in tqdm(zip(acg_t.obs['leiden'].dtype.categories, np.ravel(ax))):
    acg_t.obs['val'] = acg_t.obs['leiden'] == val
    ax.set_facecolor('gray')
    sc.pl.umap(acg_t,color='val', ax=ax, show=False, return_fig=False, title=val)
    acg_t.obs.drop(columns='val', inplace=True)
plt.tight_layout()

In [None]:
fig, ax = plt.subplots(1,1)
ax.set_facecolor('gray')
sc.pl.umap(acg_t, color=['percent_mito'], ax=ax)

In [None]:
clusts = [5, 6, 7, 9, 11, 12]

In [None]:
sub_acgt = dict()
for clust in clusts:
    sub_acgt[str(clust)] = acg_t[acg_t.obs['leiden'] == str(clust)].copy()

In [None]:
sc.tl.leiden(sub_acgt['5'], resolution=0.7) # subcluster them using Leiden
sc.pl.umap(sub_acgt['5'],color='leiden', size=5)

In [None]:
groupings = [[1, 2, 3]
            ]
grouped_clusts = [i for j in groupings for i in j]
numclusts = np.unique(sub_acgt['5'].obs['leiden'].values.astype(int))
for i in np.setdiff1d(numclusts, grouped_clusts):
    groupings.append([i])

In [None]:
# name each merged group 'ct<i>' and record which leiden labels belong to it
ctdict = {'ct%s' % i: members for i, members in enumerate(groupings)}
label_to_ct = dict()
for ct, members in ctdict.items():
    for clust in members:
        label_to_ct.setdefault(str(clust), ct) # the first grouping that lists a label wins

# Exact-label lookup rather than `replace(..., regex=True)`: a regex replace on plain strings
# substitutes substrings, so a label could be rewritten inside an already assigned 'ct<i>' name.
old_labels = sub_acgt['5'].obs['leiden'].astype(str)
sub_acgt['5'].obs['celltype'] = [label_to_ct.get(label, label) for label in old_labels]
# the new leiden label is the group index, i.e. the celltype name without its 'ct' prefix
sub_acgt['5'].obs['leiden'] = [ct.strip('ct') for ct in sub_acgt['5'].obs['celltype']]

In [None]:
sc.pl.umap(sub_acgt['5'], color='leiden', size=5)

In [None]:
sc.tl.leiden(sub_acgt['6'], resolution=0.2) # subcluster them using Leiden
sc.pl.umap(sub_acgt['6'],color='leiden', size=5)

In [None]:
sc.tl.leiden(sub_acgt['7'], resolution=0.3) # subcluster them using Leiden
sc.pl.umap(sub_acgt['7'],color='leiden', size=5)

In [None]:
sc.tl.leiden(sub_acgt['9'], resolution=0.3) # subcluster them using Leiden
sc.pl.umap(sub_acgt['9'],color='leiden', size=5)

In [None]:
sc.tl.leiden(sub_acgt['11'], resolution=0.3) # subcluster them using Leiden
sc.pl.umap(sub_acgt['11'],color='leiden', size=5)

In [None]:
sc.tl.leiden(sub_acgt['12'], resolution=0.5) # subcluster them using Leiden
sc.pl.umap(sub_acgt['12'],color='leiden', size=5)

In [None]:
groupings = [[0, 2, 4]
            ]
grouped_clusts = [i for j in groupings for i in j]
numclusts = np.unique(sub_acgt['12'].obs['leiden'].values.astype(int))
for i in np.setdiff1d(numclusts, grouped_clusts):
    groupings.append([i])

In [None]:
# name each merged group 'ct<i>' and record which leiden labels belong to it
ctdict = {'ct%s' % i: members for i, members in enumerate(groupings)}
label_to_ct = dict()
for ct, members in ctdict.items():
    for clust in members:
        label_to_ct.setdefault(str(clust), ct) # the first grouping that lists a label wins

# Exact-label lookup rather than `replace(..., regex=True)`: a regex replace on plain strings
# substitutes substrings, so a label could be rewritten inside an already assigned 'ct<i>' name.
old_labels = sub_acgt['12'].obs['leiden'].astype(str)
sub_acgt['12'].obs['celltype'] = [label_to_ct.get(label, label) for label in old_labels]
# the new leiden label is the group index, i.e. the celltype name without its 'ct' prefix
sub_acgt['12'].obs['leiden'] = [ct.strip('ct') for ct in sub_acgt['12'].obs['celltype']]

In [None]:
sc.pl.umap(sub_acgt['12'], color='leiden', size=5)

In [None]:
def sub_cluster_mapper(adata, sub_adatas):
    '''
    Fold sub-clustering results back into `adata.obs['leiden']` and return `adata`.

    adata: object whose `.obs['leiden']` holds integer-like cluster labels.
    sub_adatas: dict mapping a parent cluster label (as a string) to the object holding
        only that cluster's cells, with its own `.obs['leiden']` sub-cluster labels.

    Cells in sub-cluster '0' keep their parent label. Every other (parent, sub-cluster)
    pair becomes a new cluster, numbered consecutively after the largest existing label.
    New numbers are handed out in the sorted (string) order of the provisional
    '<parent>.<sub>' names, e.g. '10.1' before '6.1'.

    Note that `adata` is modified in place; pass a copy if the original labels are
    still needed (for instance to retry with different resolutions).
    '''
    parent = adata.obs['leiden'].astype(str)

    # provisional labels: '<parent>.<sub>' for every cell that left sub-cluster '0'
    provisional = parent.copy()
    for clust_name, sub_adata in sub_adatas.items():
        members = parent.index[parent == clust_name]
        sub_labels = sub_adata.obs.loc[members, 'leiden'].astype(str)
        split_off = sub_labels[sub_labels != '0']
        if not split_off.empty:
            provisional.loc[split_off.index] = clust_name + '.' + split_off

    # rename the '<parent>.<sub>' labels to fresh integer names
    added_clusts = sorted(set(provisional) - set(parent))
    new_clust_names_start = parent.astype(int).max() + 1
    rename = {label: str(new_clust_names_start + offset) for offset, label in enumerate(added_clusts)}

    # keep the column as plain strings; converting to 'category' here messes things up,
    # scanpy does that conversion itself when it plots
    adata.obs['leiden'] = [rename.get(label, label) for label in provisional]
    return adata

In [None]:
acg_t = sub_cluster_mapper(acg_t, sub_acgt)

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(20,10))
for color, ax, palette in zip(['cond', 'leiden'], ax, [None, sc.pl.palettes.default_20]):
    sc.pl.umap(acg_t,color=color, ax=ax, show=False, return_fig=False, size=4, palette=palette)

In [None]:
fig, ax = plt.subplots(4, 6, figsize=(30,15))
for val, ax in tqdm(zip(acg_t.obs['leiden'].dtype.categories, np.ravel(ax))):
    acg_t.obs['val'] = acg_t.obs['leiden'] == val
    ax.set_facecolor('gray')
    sc.pl.umap(acg_t,color='val', ax=ax, show=False, return_fig=False, title=val)
    acg_t.obs.drop(columns='val', inplace=True)
plt.tight_layout()

In [None]:
groupings = [[2, 3],
             [7, 12],
             [21, 22]
            ]
grouped_clusts = [i for j in groupings for i in j]
numclusts = np.unique(acg_t.obs['leiden'].values.astype(int))
for i in np.setdiff1d(numclusts, grouped_clusts):
    groupings.append([i])

In [None]:
# name each merged group 'ct<i>' and record which leiden labels belong to it
ctdict = {'ct%s' % i: members for i, members in enumerate(groupings)}
label_to_ct = dict()
for ct, members in ctdict.items():
    for clust in members:
        label_to_ct.setdefault(str(clust), ct) # the first grouping that lists a label wins

# Exact-label lookup rather than `replace(..., regex=True)`: a regex replace on plain strings
# substitutes substrings, so '2' would also rewrite '12', '21' and '22'.
old_labels = acg_t.obs['leiden'].astype(str)
acg_t.obs['celltype'] = [label_to_ct.get(label, label) for label in old_labels]
# the new leiden label is the group index, i.e. the celltype name without its 'ct' prefix
acg_t.obs['leiden'] = [ct.strip('ct') for ct in acg_t.obs['celltype']]

In [None]:
sc.pl.umap(acg_t, color='leiden', size=1)

In [None]:
fig, ax = plt.subplots(4, 5, figsize=(30,20))
for val, ax in tqdm(zip(acg_t.obs['leiden'].dtype.categories, np.ravel(ax))):
    acg_t.obs['val'] = acg_t.obs['leiden'] == val
    ax.set_facecolor('gray')
    sc.pl.umap(acg_t,color='val', ax=ax, show=False, return_fig=False, title=val)
    acg_t.obs.drop(columns='val', inplace=True)
plt.tight_layout()

In [None]:
acg_t = acg_t[~acg_t.obs['leiden'].isin(['2','10','12']),:].copy()

In [None]:
# renumber so that the colors match what I usually expect the numbers to be :)
# new numbers follow the order of the existing categories; an explicit mapping is used
# instead of in-place `replace` calls on the column, which rely on mutating a column accessor
clusts = acg_t.obs['leiden'].dtype.categories
renumbered = {clust: str(i) for i, clust in enumerate(clusts)}
acg_t.obs['leiden'] = [renumbered[label] for label in acg_t.obs['leiden']]

In [None]:
fig, ax = plt.subplots(1,1,figsize=(8,8))
ax.set_facecolor('black')
sc.pl.umap(acg_t, color='leiden', size=10, ax=ax)

In [None]:
sc.settings.verbosity = 0
warnings.filterwarnings('ignore')
sc.tl.rank_genes_groups(acg_t, groupby='leiden', n_genes=20)
warnings.filterwarnings('default')
sc.pl.rank_genes_groups(acg_t, ncols=5, n_genes=20)
sc.settings.verbosity = 4

In [None]:
sc.pl.umap(acg_t, color=['CD4|CD4', 'CD8|CD8A'], size=2, ncols=5)

In [None]:
transcripts = acg_t.var_names
proteins = [i for i in acg_t.obs.columns if '|' in i]

In [None]:
name = 'SELL'
print([i for i in proteins if name in i])
print([i for i in transcripts if name in i])

In [None]:
focus_clusts = list(map(str,[0, 2, 3, 4, 5, 6]))
clusts = acg_t.obs['leiden'].dtype.categories
clust_bool = [i in focus_clusts for i in clusts]
colors = np.array(acg_t.uns['leiden_colors'])

In [None]:
features = ['CD3|CD3E', 'CD8|CD8A', 'CD4|CD4','CD8A', 'CD4', 
            'CD183|CXCR3', 'CXCR3', 'STAT1', 'STAT2', 'STAT4', 
            'GATA3', 'CD194|CCR4', 'CCR4', 'CD196|CCR6', 'CCR6',
            'CD185|CXCR5', 'CXCR5', 'SMAD2', 'SMAD3', 'SMAD7'
            ]

In [None]:
# Only plot features that can be looked up, and report the rest instead of failing midway.
# NOTE(review): assumes `obs_vector` resolves a key against the obs columns or var_names — confirm.
plottable = [f for f in features if f in acg_t.obs.columns or f in acg_t.var_names]
missing = [f for f in features if f not in plottable]
if missing:
    print('not found, skipped: %s' % missing)

# five panels per row, with enough rows for every feature (a fixed 3x5 grid silently
# dropped everything after the 15th feature)
n_cols = 5
n_rows = max(1, -(-len(plottable) // n_cols)) # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 10 * n_rows / 3))
panels = np.ravel(axes)
for f, ax in tqdm(zip(plottable, panels), total=len(plottable)):
    for clust, color in zip(focus_clusts, colors[clust_bool]):
        sns.kdeplot(np.array(acg_t[acg_t.obs['leiden'] == clust].obs_vector(f)), # need the np.array wrapper because for the genes it returns an "Array View"
                color=color, alpha=0.5, ax=ax)
    ax.set_title(f)
for spare in panels[len(plottable):]:
    spare.axis('off') # hide panels that have no feature
plt.tight_layout()

In [None]:
features = ['CD45RO|PTPRC', 'CD45RA|PTPRC', 'CD25|IL2RA', 'CD127|IL7R', 
            'IL7R', 'CD38|CD38', 'CD38', 'SELL', 'CD62L|SELL', 
            'CCR7', 'CD197|CCR7', 'CD69', 'CD69|CD69', 'CD27', 'CD27|CD27']

In [None]:
# Only plot features that can be looked up, and report the rest instead of failing midway.
# NOTE(review): assumes `obs_vector` resolves a key against the obs columns or var_names — confirm.
plottable = [f for f in features if f in acg_t.obs.columns or f in acg_t.var_names]
missing = [f for f in features if f not in plottable]
if missing:
    print('not found, skipped: %s' % missing)

# five panels per row, with enough rows for every feature; the progress total now
# follows the feature list instead of a hard-coded 12
n_cols = 5
n_rows = max(1, -(-len(plottable) // n_cols)) # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 10 * n_rows / 3))
panels = np.ravel(axes)
for f, ax in tqdm(zip(plottable, panels), total=len(plottable)):
    for clust, color in zip(focus_clusts, colors[clust_bool]):
        sns.kdeplot(np.array(acg_t[acg_t.obs['leiden'] == clust].obs_vector(f)), # need the np.array wrapper because for the genes it returns an "Array View"
                color=color, alpha=0.5, ax=ax)
    ax.set_title(f)
for spare in panels[len(plottable):]:
    spare.axis('off') # hide panels that have no feature
plt.tight_layout()

In [None]:
features = ['HLA-ABC|HLA-A_B_C', 'HHLA3', 'HLA-F', 'HLA-G', 'HLA-A', 
            'HLA-E', 'HLA-C', 'HLA-B', 'HLA-DRA', 'HLA-DRB5', 
            'HLA-DRB1', 'HLA-DQA1', 'HLA-DQB1', 'HLA-DMB', 'HLA-DMA', 
            'HLA-DOA', 'HLA-DPA1', 'HLA-DPB1']

In [None]:
fig, ax = plt.subplots(3, 6, figsize=(20,8))
for f, ax in tqdm(zip(features, np.ravel(ax)), total=18):
    for clust, color in zip(focus_clusts, colors[clust_bool]):
        sns.kdeplot(np.array(acg_t[acg_t.obs['leiden'] == clust].obs_vector(f)), # need the np.array wrapper because for the genes it returns an "Array View"
                color=color, alpha=0.5, ax=ax)
    ax.set_title(f)
plt.tight_layout()

In [None]:
sc.settings.verbosity = 0
warnings.filterwarnings('ignore')
sc.tl.rank_genes_groups(acg_t, groupby='leiden', n_genes=1000, groups=['16'])
warnings.filterwarnings('default')
# sc.pl.rank_genes_groups(acg_t, ncols=5, n_genes=500)
sc.settings.verbosity = 4

In [None]:
def get_dge(adata):
    '''
    Collect the `rank_genes_groups` results stored on `adata` into one DataFrame.

    Each of the fields 'scores', 'names', 'logfoldchanges', 'pvals' and 'pvals_adj'
    is read from `adata.uns['rank_genes_groups']`, turned into a plain array and
    flattened into a single column. With more than one group per row, the values
    of a row end up next to each other in the flattened column.
    '''
    ranked = adata.uns['rank_genes_groups']
    fields = ('scores', 'names', 'logfoldchanges', 'pvals', 'pvals_adj')
    columns = {field: np.asarray(ranked[field].tolist()).ravel() for field in fields}
    return pd.DataFrame(columns)

In [None]:
deg_data = get_dge(acg_t)

In [None]:
plt.scatter(deg_data['logfoldchanges'].values, -np.log10(deg_data['pvals_adj'].values), s=1)
plt.scatter(deg_data[deg_data['names'].str.contains('RP')]['logfoldchanges'].values, 
            -np.log10(deg_data[deg_data['names'].str.contains('RP')]['pvals_adj'].values), s=1, color='r');

In [None]:
deg_data[(deg_data['logfoldchanges'] > 2.5) & (-np.log10(deg_data['pvals_adj']) > 10)]['names'].values

In [None]:
sc.pl.umap(acg_t, color=['BTG1'], size=2, ncols=5)

In [None]:
def grouped_rank(adata, groups, return_uns=False):
    '''
    Rank genes between two sets of leiden clusters (Wilcoxon) and plot the result.

    adata: object with a 'leiden' column in `.obs` and a computed UMAP.
    groups: two lists of cluster labels. Group 0 is tested against group 1 as the
        reference; every cluster listed in neither forms a third group that takes
        no part in the comparison. The list passed in is not modified.
    return_uns: currently unused.

    Two UMAPs highlighting the cells of group 0 and group 1 are drawn first, then
    the ranking plot. The results stay in `adata.uns['rank_genes_groups']`; the
    temporary `.obs` columns are removed again. Returns None.
    '''
    assert len(groups) == 2, 'groups must hold exactly two lists of cluster labels'

    # work on string copies so the caller's list is left untouched
    compared = [np.array(group).astype(str) for group in groups]
    claimed = [label for group in compared for label in group]
    all_labels = np.unique(adata.obs['leiden'].values)
    compared.append(np.setdiff1d(all_labels, claimed).astype('<U21'))

    # show which cells belong to group 0 and to group 1
    fig, axes = plt.subplots(1, 2, figsize=(8, 4))
    for clusts, title, ax in zip(compared, ['0', '1'], axes):
        adata.obs['val'] = adata.obs['leiden'].isin(clusts).values
        ax.set_facecolor('gray')
        sc.pl.umap(adata, color='val', ax=ax, size=1, show=False, return_fig=False, title=title)
    adata.obs.drop(columns='val', inplace=True)

    # Exact-label lookup rather than `replace(..., regex=True)`: a regex replace on plain
    # strings substitutes substrings, so '1' would also rewrite '16'.
    group_of = dict()
    for idx, group in enumerate(compared):
        for label in group:
            group_of.setdefault(str(label), str(idx)) # a label listed twice stays in its first group
    adata.obs['rank_compare'] = adata.obs['leiden'].astype(str).map(group_of).astype('category')

    previous_verbosity = sc.settings.verbosity
    sc.settings.verbosity = 0
    try:
        # silence warnings for the test only; the previous filters come back afterwards
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            sc.tl.rank_genes_groups(adata, groupby='rank_compare', n_genes=1000, groups=['0'], reference='1', test='wilcoxon')
        sc.pl.rank_genes_groups(adata, ncols=5, n_genes=20)
    finally:
        sc.settings.verbosity = previous_verbosity
        adata.obs.drop(columns='rank_compare', inplace=True)
    return

In [None]:
grouped_rank(acg_t,groups=[[0], [8, 2, 3, 9, 10, 16]])

The cells below use the [goatools](https://github.com/tanghaibao/goatools) Gene Ontology package for Python. Most of the code here is modeled after the [notebooks](https://github.com/tanghaibao/goatools#ipython-notebooks) found on its GitHub page.

In [None]:
from goatools.base import download_go_basic_obo
from goatools.base import download_ncbi_associations
from goatools.obo_parser import GODag
from goatools.anno.genetogo_reader import Gene2GoReader
from goatools.test_data.genes_NCBI_9606_ProteinCoding import GENEID2NT as GeneID2nt_hum
from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS

In [None]:
from IPython.display import display # need this to print out the results without truncation.

### Gene Ontology Analysis

In [None]:
obo_fname = download_go_basic_obo(prefix + "go-basic.obo")
fin_gene2go = download_ncbi_associations(prefix + 'gene2go')
obodag = GODag(prefix + "go-basic.obo")
# Read NCBI's gene2go. Store annotations in a list of named tuples
objanno = Gene2GoReader(fin_gene2go, taxids=[9606])
# Get namespace2association where:
#    namespace is:
#        BP: biological_process               
#        MF: molecular_function
#        CC: cellular_component
#    assocation is a dict:
#        key: NCBI GeneID
#        value: A set of GO IDs associated with that gene
ns2assoc = objanno.get_ns2assc()
for nspc, id2gos in ns2assoc.items():
    print("{NS} {N:,} annotated human genes".format(NS=nspc, N=len(id2gos)))
gene_ids = list(GeneID2nt_hum.keys())
symbol2geneid = dict(zip([GeneID2nt_hum[i].Symbol for i in gene_ids], [i for i in gene_ids]))

The next cell produces a lot of output; I recommend collapsing it in the notebook.

In [None]:
top_genes = [i[0] for i in acg_t.uns['rank_genes_groups']['names']][:25]

In [None]:
results = list()
ids = list()
for i in top_genes:
    # a symbol missing from the protein-coding table is simply skipped;
    # any other error should surface instead of being swallowed
    try:
        ids.append(symbol2geneid[i])
    except KeyError:
        continue
# fraction of the top genes that could be mapped to an NCBI GeneID
print(len(ids)/len(top_genes))

goeaobj = GOEnrichmentStudyNS(
        GeneID2nt_hum.keys(), # List of human protein-coding genes
        ns2assoc, # geneid/GO associations
        obodag, # Ontologies
        propagate_counts = False,
        alpha = 0.05, # default significance cut-off
        methods = ['fdr_bh']) # default multiple-testing correction method

goea_results_all = goeaobj.run_study(ids)
# keep the term name, its namespace and the -log10 of the BH-adjusted p-value
for i in goea_results_all:
    results.append([i.name, i.NS, -np.log10(i.p_fdr_bh)])

In [None]:
results_df = pd.DataFrame(results, columns=['GO_term','GO_type','-log10padj'])

In [None]:
df = results_df[(results_df['-log10padj'] > -np.log10(0.05))
               ][['GO_type', 'GO_term', '-log10padj']].sort_values(['GO_type', '-log10padj'], ascending=False)
with pd.option_context('display.max_rows', 120, 'display.max_columns', 10, 'display.max_colwidth', -1):
    display(df)

In [None]:
sc.pl.umap(acg_t, color=['SELL'], size=2, ncols=5)

In [None]:
def grouped_rank(adata, groups):
    '''
    Compare two sets of leiden clusters with a Wilcoxon rank-sum test.

    The two cluster sets are first highlighted side by side on the UMAP. Every
    cell is then labelled '0' (cluster in the first set), '1' (cluster in the
    second set) or '2' (cluster in neither set), and `sc.tl.rank_genes_groups`
    is run for group '0' against reference '1'. The result is left in `adata`
    by scanpy and the top 20 genes are plotted. The temporary `val` and
    `rank_compare` columns are removed from `adata.obs` before returning.

    Parameters
    ----------
    adata
        Object with a `leiden` column in `.obs` that `sc.pl.umap` can plot.
    groups
        Exactly two collections of cluster ids (ints or strings). The
        caller's object is not modified.

    Raises
    ------
    ValueError
        If `groups` does not hold exactly two cluster sets.
    '''
    if len(groups) != 2:
        raise ValueError('groups must contain exactly two sets of clusters, got %d' % len(groups))

    # work on string copies so the list passed in by the caller is left untouched
    groups = [np.array(members).astype(str) for members in groups]
    leiden = adata.obs['leiden'].astype(str)

    # everything that was not named explicitly becomes the third group
    grouped_clusts = [clust for members in groups for clust in members]
    groups.append(np.setdiff1d(np.unique(leiden.values), grouped_clusts).astype('<U21'))

    fig, axes = plt.subplots(1, 2, figsize=(8, 4))
    try:
        for clusts, title, ax in zip(groups, ['0', '1'], axes):
            adata.obs['val'] = leiden.isin(clusts).values
            ax.set_facecolor('gray')
            sc.pl.umap(adata, color='val', ax=ax, size=1, show=False, return_fig=False, title=title)
    finally:
        adata.obs.drop(columns='val', inplace=True, errors='ignore')

    # exact cluster -> group lookup; a substring/regex replace would also hit
    # e.g. '10' and '11' when relabelling cluster '1'
    label_of = {str(clust): str(idx) for idx, members in enumerate(groups) for clust in members}
    adata.obs['rank_compare'] = leiden.map(label_of).astype('category')

    previous_verbosity = sc.settings.verbosity
    sc.settings.verbosity = 0
    try:
        # silence warnings only for the test itself, without touching the global filters
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            sc.tl.rank_genes_groups(adata, groupby='rank_compare', n_genes=1000, groups=['0'], reference='1', test='wilcoxon')
        sc.pl.rank_genes_groups(adata, ncols=5, n_genes=20)
    finally:
        sc.settings.verbosity = previous_verbosity
        adata.obs.drop(columns='rank_compare', inplace=True)
    return

In [None]:
def add_raw(adata, transformed=True):
    '''
    Attach per-well counts to `adata` as its `.raw` attribute.

    The per-well objects are unpickled from `prefix + 'pkls/aggr/wells.sng.w_covars.pkl'`,
    their barcodes are rewritten to "<first 16 characters>-<well key>", wells 0
    through 11 are concatenated and the result is subset to `adata.obs_names`.
    When `transformed` equals True the subset is normalised per cell to 1e6
    counts and log1p-transformed before being stored.
    '''
    pkl_path = prefix + 'pkls/aggr/wells.sng.w_covars.pkl'
    with open(pkl_path, 'rb') as handle:
        wells = pkl.load(handle)

    # suffix every barcode with its well key so barcodes are distinguishable across wells
    for well_key, well in wells.items():
        well['adata'].obs_names = [bc[:16] + '-%s' % well_key for bc in well['adata'].obs_names]

    # wells are looked up by the integer keys 0..11
    first_well, *other_wells = [wells[k]['adata'] for k in range(12)]
    combined = first_well.concatenate(*other_wells)
    raw = combined[adata.obs_names, :]

    if transformed == True:
        sc.pp.normalize_per_cell(raw, counts_per_cell_after=1e6)
        sc.pp.log1p(raw)

    adata.raw = raw
    return

In [None]:
add_raw(acg_t)

In [None]:
def get_dge(adata):
    '''
    Collect the stored `rank_genes_groups` result into one flat DataFrame.

    Returns a DataFrame with the columns 'scores', 'names', 'logfoldchanges',
    'pvals' and 'pvals_adj', each built by flattening the corresponding entry
    of `adata.uns['rank_genes_groups']`.
    '''
    ranked = adata.uns['rank_genes_groups']
    fields = ('scores', 'names', 'logfoldchanges', 'pvals', 'pvals_adj')
    # NOTE(review): values are flattened row by row, so this presumably only
    # gives one row per gene when a single group was tested -- confirm
    return pd.DataFrame({field: np.array(ranked[field].tolist()).flatten() for field in fields})

In [None]:
sc.settings.verbosity = 0
warnings.filterwarnings('ignore')
sc.tl.rank_genes_groups(acg_t, groupby='leiden', n_genes=1000, groups=['15'], reference='0')
warnings.filterwarnings('default')
sc.pl.rank_genes_groups(acg_t, ncols=5, n_genes=20)
sc.settings.verbosity = 4

In [None]:
deg_data = get_dge(acg_t)

In [None]:
deg_data = deg_data[deg_data['pvals_adj'] != 0]

In [None]:
-np.log10(0.05)

In [None]:
plt.scatter(deg_data['logfoldchanges'].values, -np.log10(deg_data['pvals_adj'].values), s=1);
plt.scatter(deg_data[deg_data['names'].str.contains('RP')]['logfoldchanges'].values, 
            -np.log10(deg_data[deg_data['names'].str.contains('RP')]['pvals_adj'].values), s=1, color='r');
# plt.yscale('log')
# plt.xscale('log')

In [None]:
sc.pl.umap(acg_t, color='XIST', s=3, use_raw=False)

In [None]:
path = prefix + 'pkls/aggr/acg_t.pkl'

# with open(path,'wb') as file:
#     pkl.dump(acg_t, file, protocol=4)

with open(path,'rb') as file:
    acg_t = pkl.load(file)

In [None]:
grouped_rank(acg_t,groups=[[1, 9], [2, 3]])

In [None]:
def get_dge(adata):
    '''
    Flatten `adata.uns['rank_genes_groups']` into a DataFrame.

    One column is produced for each of 'scores', 'names', 'logfoldchanges',
    'pvals' and 'pvals_adj', in that order.
    '''
    stats = adata.uns['rank_genes_groups']
    table = pd.DataFrame()
    for column in ('scores', 'names', 'logfoldchanges', 'pvals', 'pvals_adj'):
        # each entry is converted to nested lists, then collapsed to one dimension
        per_gene = stats[column].tolist()
        table[column] = np.asarray(per_gene).reshape(-1)
    return table

In [None]:
deg_data = get_dge(acg_t)

In [None]:
plt.scatter(deg_data['logfoldchanges'].values, -np.log10(deg_data['pvals_adj'].values), s=1)
plt.scatter(deg_data[deg_data['names'].str.contains('MT')]['logfoldchanges'].values, 
            -np.log10(deg_data[deg_data['names'].str.contains('MT')]['pvals_adj'].values), s=1, color='r');

In [None]:
plt.scatter(deg_data['logfoldchanges'].values, -np.log10(deg_data['pvals_adj'].values), s=1)

In [None]:
sc.pl.umap(acg_t, color=deg_data[(deg_data['logfoldchanges'] > 1) & (-np.log10(deg_data['pvals_adj']) > 150)]['names'].values, s=3, use_raw=False)

In [None]:
sc.settings.verbosity = 0 
warnings.filterwarnings('ignore')
sc.tl.rank_genes_groups(acg_t, groupby='leiden', n_genes=1000, groups=['3'], reference='2')
warnings.filterwarnings('default')
sc.pl.rank_genes_groups(acg_t, ncols=5, n_genes=20)
sc.settings.verbosity = 4

In [None]:
deg_data = get_dge(acg_t)

In [None]:
plt.scatter(deg_data['logfoldchanges'].values, -np.log10(deg_data['pvals_adj'].values), s=1)
plt.scatter(deg_data[deg_data['names'].str.contains('RP')]['logfoldchanges'].values, 
            -np.log10(deg_data[deg_data['names'].str.contains('RP')]['pvals_adj'].values), s=1, color='r');

In [None]:
sc.pl.umap(acg_t, color=deg_data[(deg_data['logfoldchanges'] > 1) & (-np.log10(deg_data['pvals_adj']) > 25)]['names'].values, s=3, use_raw=False)

In [None]:
for group in groups_dict:
    for clust in groups_dict[group]:
        adata.obs['rank_compare'].replace(clust, 'ct%s' % group, regex=True, inplace=True)

In [None]:
adata.obs['rank_compare']

In [None]:
adata.obs['rank_compare'] = adata.obs['leiden'].copy()
groups_dict = dict(zip(range(3), groups))
# iterate the dict built on the line above; the loop used to reference
# `group_dicts`, a name that is never defined
for group in groups_dict:
    for clust in groups_dict[group]:
        adata.obs['rank_compare'].replace(clust, str(group), inplace=True)

In [None]:
groups_dict

In [None]:
adata.obs['rank_compare'].replace()

In [None]:
grouped_rank(acg_t,groups=[[14], [0, 1, 2, 3, 7, 9, 10, 11, 13]])

In [None]:
sc.pl.umap(acg_t, color=['S100A4','CYBA','KLRB1','ARID5B', 'FXYD5'], size=3, ncols=5)

In [None]:
sc.settings.verbosity = 0 
warnings.filterwarnings('ignore')
sc.tl.rank_genes_groups(acg_t, groupby='leiden', n_genes=20, groups=['3'], reference='2')
warnings.filterwarnings('default')
sc.pl.rank_genes_groups(acg_t, ncols=5, n_genes=20)
sc.settings.verbosity = 4

In [None]:
sc.settings.verbosity = 0 
warnings.filterwarnings('ignore')
sc.tl.rank_genes_groups(acg_t, groupby='leiden', n_genes=20, groups=['2'], reference='3')
warnings.filterwarnings('default')
sc.pl.rank_genes_groups(acg_t, ncols=5, n_genes=20)
sc.settings.verbosity = 4

In [None]:
val = '20'
tnk.obs['val'] = tnk.obs['leiden'] == val
fig, ax = plt.subplots(1,1)
ax.set_facecolor('gray')
sc.pl.umap(tnk,color='val', ax=ax)
tnk.obs.drop(columns='val', inplace=True)

In [None]:
sc.pl.umap(tnk,color='FOXP3')

In [None]:
batches = tnk.obs['batch'].unique() # get a list of the batches

# make a new column in the `.obs` for each batch that is of dtype `int` and that just takes on values of 0 and 1 so that it gets plotted as a continuous variable instead of a categorical one
for batch in batches:
    tnk.obs['batch_%s' % batch] = (tnk.obs['batch'] == batch).astype(int)

# plot now with sort_order=True, which should apply to this new continuous variable
sc.pl.umap(tnk, color=['batch_%s' % i for i in batches],sort_order=True, ncols=4)

In [None]:
sc.tl.leiden(sub_tnk['2'], resolution=0.2) # subcluster them using Leiden
sc.pl.umap(sub_tnk['2'],color='leiden', size=0.6)

In [None]:
sc.tl.leiden(sub_tnk['3'], resolution=0.3) # subcluster them using Leiden
sc.pl.umap(sub_tnk['3'],color='leiden', size=0.6)

In [None]:
fig, ax = plt.subplots(1,1)
ax.set_facecolor('gray')
sc.pl.umap(sub_tnk['3'], color='FOXP3', size=2, ax=ax)

In [None]:
sc.settings.verbosity = 0 
warnings.filterwarnings('ignore')
sc.tl.rank_genes_groups(sub_tnk['3'], groupby='leiden', n_genes=50, groups=['2'], reference='0')
warnings.filterwarnings('default')
sc.pl.rank_genes_groups(sub_tnk['3'], ncols=5, n_genes=20)
sc.settings.verbosity = 4

In [None]:
ctdict = dict()
ctdict['ct0'] = [0, 1, 2, 3, 4, 6]
ctdict['ct1'] = [5]
sub_tnk['4'].obs['sub_ct'] = sub_tnk['4'].obs['leiden']
for ct in ctdict:
    for clust in ctdict[ct]:
        sub_tnk['4'].obs['sub_ct'].replace(r'^%s$' % str(clust), ct, regex=True, inplace=True)
sub_tnk['4'].obs['leiden'] = [i.strip('ct') for i in sub_tnk['4'].obs['sub_ct'].astype('category')]

In [None]:
tnk['adata'] = sub_cluster_mapper(tnk['adata'], sub_tnk)

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(20,10))
for color, ax in zip(['cond', 'leiden'], ax):
    sc.pl.umap(tnk['adata'],color=color, ax=ax, show=False, return_fig=False, size=2)

In [None]:
acg_lo_mito = [1, 3, 4, 5, 15, 16, 17]
br_lo_mito = [0, 8, 6, 13, 18, 20, 24]
p_all = [2, 14, 19]
hi_mito = [7, 9, 10, 11, 21]
other = [12, 22, 23]

In [None]:
ctdict = dict()
ctdict['acg_lo'] = acg_lo_mito
ctdict['br_lo'] = br_lo_mito
ctdict['p_all'] = p_all
ctdict['hi_m'] = hi_mito
ctdict['other'] = other
tnk['adata'].obs['ct2'] = tnk['adata'].obs['leiden']
for ct in ctdict:
    for clust in ctdict[ct]:
        tnk['adata'].obs['ct2'].replace(r'^%s$' % str(clust), ct, regex=True, inplace=True)
tnk['adata'].obs['ct2'] = tnk['adata'].obs['ct2'].astype('category')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(8,8))
sc.pl.umap(tnk['adata'],color='ct2', ax=ax, show=False, return_fig=False, size=2);

### Extract Out Cell Barcodes

I want to take a look at the control and high mito populations in a separate notebook and see if I can adjust the data so that they cluster together.

In [None]:
path = prefix + 'acg.tnk.and.high.mito.bcs.txt'

with open(path,'w') as file:
    for bc in tnk['adata'][(tnk['adata'].obs['ct2'] == 'acg_lo') | 
                           (tnk['adata'].obs['ct2'] == 'hi_m')].obs_names:
        file.write(bc + '\n')

### A/C/G, Low Mito

Let's start from the top left. These clusters are marked by cells stimulated with TNF-alpha and IFN-gamma, which overlap very nicely with our Control cells, indicating that there is probably no major effect on these populations.

In [None]:
acg = tnk['adata'][tnk['adata'].obs['ct2'] == 'acg_lo'].copy()

# choosing to use true raw counts here - if I don't end up getting a bunch of house keeping genes coming as differentially expressed, not ideal
acg.raw = tnk['raw'][tnk['adata'].obs['ct2'] == 'acg_lo'].copy()
# acg.raw = sc.pp.log1p(sc.pp.normalize_per_cell(tnk['raw'][tnk['adata'].obs['ct2'] == 'acg_lo'], copy=True), copy=True)

In [None]:
def acg_umap(acg, color, ax=None, **kwargs):
    '''
    Plot a UMAP of `acg` coloured by `color`, cropped to x <= 0 and y >= -5.

    Parameters
    ----------
    acg
        Object passed straight to `sc.pl.umap`.
    color
        Forwarded to `sc.pl.umap` as `color`.
    ax
        Axes to draw on. When omitted a new 8x8 figure is created. The string
        'None' (the former default) is still accepted and treated the same way.
    **kwargs
        Forwarded to `sc.pl.umap`. `size` defaults to 4 and `use_raw` to False
        unless given explicitly.
    '''
    if ax is None or (isinstance(ax, str) and ax == 'None'):
        _, ax = plt.subplots(1, 1, figsize=(8, 8))

    # defaults only: an explicit size/use_raw from the caller wins instead of
    # colliding with a hard-coded keyword
    kwargs.setdefault('size', 4)
    kwargs.setdefault('use_raw', False)
    sc.pl.umap(acg, color=color, ax=ax, show=False, return_fig=False, **kwargs)

    # keep the automatic lower x / upper y limits, clamp the other two
    x_low, _ = ax.get_xlim()
    _, y_high = ax.get_ylim()
    ax.set_xlim((x_low, 0))
    ax.set_ylim((-5, y_high))
    return

In [None]:
acg_umap(acg,'exp_id')

In [None]:
sc.tl.leiden(acg,resolution=0.5)

In [None]:
acg_umap(acg,color='leiden')

In [None]:
ctdict = dict()
ctdict['ct0'] = [0]
ctdict['ct1'] = [1]
ctdict['ct2'] = [2, 5, 8]
ctdict['ct3'] = [3]
ctdict['ct4'] = [4]
ctdict['ct5'] = [6]
ctdict['ct6'] = [7]
acg.obs['sub_ct'] = acg.obs['leiden']
for ct in ctdict:
    for clust in ctdict[ct]:
        acg.obs['sub_ct'].replace(r'^%s$' % str(clust), ct, regex=True, inplace=True)
acg.obs['leiden'] = [i.strip('ct') for i in acg.obs['sub_ct'].astype('category')]

In [None]:
acg_umap(acg,color='leiden')

In [None]:
sub_acgs = dict() # put the new subsetted acg objects in a dictionary of acgs

In [None]:
sub_acgs['1'] = acg[acg.obs['leiden'] == '1'].copy()
sub_acgs['2'] = acg[acg.obs['leiden'] == '2'].copy()
sub_acgs['3'] = acg[acg.obs['leiden'] == '3'].copy()
sub_acgs['4'] = acg[acg.obs['leiden'] == '4'].copy()
sub_acgs['5'] = acg[acg.obs['leiden'] == '5'].copy()
sub_acgs['6'] = acg[acg.obs['leiden'] == '6'].copy()

In [None]:
sc.tl.leiden(sub_acgs['1'], resolution=0.6) # subcluster them using Leiden
acg_umap(sub_acgs['1'],color='leiden')

In [None]:
ctdict = dict()
ctdict['ct0'] = [0, 3, 4, 5, 6, 7]
ctdict['ct1'] = [1, 2]
sub_acgs['1'].obs['sub_ct'] = sub_acgs['1'].obs['leiden']
for ct in ctdict:
    for clust in ctdict[ct]:
        sub_acgs['1'].obs['sub_ct'].replace(r'^%s$' % str(clust), ct, regex=True, inplace=True)
sub_acgs['1'].obs['leiden'] = [i.strip('ct') for i in sub_acgs['1'].obs['sub_ct'].astype('category')]

In [None]:
acg_umap(sub_acgs['1'],color='leiden')

In [None]:
sc.tl.leiden(sub_acgs['2'], resolution=0.4) # subcluster them using Leiden
acg_umap(sub_acgs['2'],color='leiden')

In [None]:
ctdict = dict()
ctdict['ct0'] = [3]
ctdict['ct1'] = [0, 1, 2, 4]
sub_acgs['2'].obs['sub_ct'] = sub_acgs['2'].obs['leiden']
for ct in ctdict:
    for clust in ctdict[ct]:
        sub_acgs['2'].obs['sub_ct'].replace(r'^%s$' % str(clust), ct, regex=True, inplace=True)
sub_acgs['2'].obs['leiden'] = [i.strip('ct') for i in sub_acgs['2'].obs['sub_ct'].astype('category')]

In [None]:
acg_umap(sub_acgs['2'],color='leiden')

In [None]:
sc.tl.leiden(sub_acgs['3'], resolution=0.8) # subcluster them using Leiden
acg_umap(sub_acgs['3'],color='leiden')

In [None]:
ctdict = dict()
ctdict['ct0'] = [0, 1]
ctdict['ct1'] = [2, 4]
ctdict['ct2'] = [3, 5]
sub_acgs['3'].obs['sub_ct'] = sub_acgs['3'].obs['leiden']
for ct in ctdict:
    for clust in ctdict[ct]:
        sub_acgs['3'].obs['sub_ct'].replace(r'^%s$' % str(clust), ct, regex=True, inplace=True)
sub_acgs['3'].obs['leiden'] = [i.strip('ct') for i in sub_acgs['3'].obs['sub_ct'].astype('category')]

In [None]:
acg_umap(sub_acgs['3'],color='leiden')

In [None]:
sc.tl.leiden(sub_acgs['4'], resolution=0.2) # subcluster them using Leiden
acg_umap(sub_acgs['4'],color='leiden')

In [None]:
sc.tl.leiden(sub_acgs['5'], resolution=0.2) # subcluster them using Leiden
acg_umap(sub_acgs['5'],color='leiden')

In [None]:
ctdict = dict()
ctdict['ct0'] = [0]
ctdict['ct1'] = [1, 2]
sub_acgs['5'].obs['sub_ct'] = sub_acgs['5'].obs['leiden']
for ct in ctdict:
    for clust in ctdict[ct]:
        sub_acgs['5'].obs['sub_ct'].replace(r'^%s$' % str(clust), ct, regex=True, inplace=True)
sub_acgs['5'].obs['leiden'] = [i.strip('ct') for i in sub_acgs['5'].obs['sub_ct'].astype('category')]

In [None]:
acg_umap(sub_acgs['5'],color='leiden')

In [None]:
sc.tl.leiden(sub_acgs['6'], resolution=0.2) # subcluster them using Leiden
acg_umap(sub_acgs['6'],color='leiden')

In [None]:
def sub_cluster_mapper(adata, sub_adatas):
    '''
    Fold sub-clustering results back into `adata.obs['leiden']`.

    `sub_adatas` maps a parent cluster name (a string found in
    `adata.obs['leiden']`) to an object whose `.obs['leiden']` holds the
    sub-cluster of every cell of that parent. Cells in sub-cluster '0' keep the
    parent's name. Every other (parent, sub-cluster) pair gets a new integer
    name, counted upwards from one past the largest existing cluster number,
    in the sorted order of the temporary 'parent.sub' labels.

    `adata` is modified in place (its `leiden` column is replaced by plain
    strings) and is also returned. A KeyError is raised if a cell of a
    sub-clustered parent is missing from the corresponding sub-object.
    '''
    old_leiden = adata.obs['leiden'].astype(str)
    new_leiden = old_leiden.copy()

    # tag every cell that left sub-cluster '0' with a temporary 'parent.sub' label
    for parent, sub_adata in sub_adatas.items():
        members = old_leiden.index[(old_leiden == parent).values]
        sub_labels = sub_adata.obs.loc[members, 'leiden']
        split_off = sub_labels[(sub_labels != '0').values]
        if len(split_off) > 0:
            new_leiden.loc[split_off.index] = (parent + '.' + split_off.astype(str)).values

    # rename the temporary labels to fresh integer names that continue the
    # existing numbering
    added_clusts = sorted(set(new_leiden) - set(old_leiden))
    first_new = old_leiden.astype(int).max() + 1
    renames = {tmp: str(first_new + offset) for offset, tmp in enumerate(added_clusts)}
    new_leiden = new_leiden.map(lambda name: renames.get(name, name))

    # stored as plain strings; scanpy converts to a categorical when plotting
    adata.obs['leiden'] = new_leiden.astype(str)
    return adata

In [None]:
acg = sub_cluster_mapper(acg, sub_acgs)

In [None]:
acg.obs.drop(columns=['sub_ct'], inplace=True)

In [None]:
acg_umap(acg,color='leiden', legend_loc='on data', palette=sc.pl.palettes.vega_20_scanpy)

In [None]:
all_features = acg.var_names.tolist() + [i for i in acg.obs.columns if '|' in i]

In [None]:
def check_names(list_of_strings):
    '''
    Print every entry of the global `all_features` that contains one of the
    given substrings.

    A feature is printed once for each substring it contains. The argument
    must be a list: a bare string would be iterated character by character.

    Raises
    ------
    TypeError
        If `list_of_strings` is not a list.
    '''
    if not isinstance(list_of_strings, list):
        raise TypeError('list_of_strings must be a list of strings, got %s'
                        % type(list_of_strings).__name__)
    for f in all_features:
        for j in list_of_strings:
            if j in f:
                print(f)
    return

In [None]:
features = ['CD16|FCGR3A', 'CD56|NCAM1', 'FCGR3A', 'CD94|KLRD1', 'KLRB1', 'NKG7', 'GNLY', 'GZMB']
fig, ax = plt.subplots(2, 4, figsize=(30,14))
for f, ax in tqdm(zip(features, np.ravel(ax))):
    acg_umap(acg,color=f, ax=ax)

In [None]:
features = ['CD3|CD3E', 'CD3E', 'CD3D', 'CD8|CD8A', 'CD8A', 'CD8B', 'CD4', 'CD4|CD4']
fig, ax = plt.subplots(2, 4, figsize=(30,14))
for f, ax in tqdm(zip(features, np.ravel(ax))):
    acg_umap(acg,color=f, ax=ax)

In [None]:
acg_sub_cts = ['cd8', 'cd4', 'cd3dn', 'nk']

In [None]:
ctdict = dict()
ctdict['cd8'] = [3, 5, 6, 9, 10, 12]
ctdict['cd4'] = [0, 1, 2, 7, 8]
ctdict['cd3dn'] = [13]
ctdict['nk'] = [4, 11]
acg.obs['ct3'] = acg.obs['leiden']
for ct in ctdict:
    for clust in ctdict[ct]:
        acg.obs['ct3'].replace(r'^%s$' % str(clust), ct, regex=True, inplace=True)
acg.obs['ct3'] = acg.obs['ct3'].astype('category')

In [None]:
acg_umap(acg,color='ct3')

In [None]:
acg_subs = dict()
for ct in acg_sub_cts:
    acg_subs[ct] = acg[acg.obs['ct3'] == ct].copy()

### Phenotyping Subsets

Using [this paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4810120/) for reference for T cell subsets.

#### CD4

In [None]:
acg_umap(acg_subs['cd4'],color='leiden')

In [None]:
check_names(['CTLA'])

In [None]:
features = ['CD45RO|PTPRC', 'CD45RA|PTPRC', 'IL2RA', 'CD25|IL2RA', 'CD127|IL7R', 'IL7R', 'CTLA4', 'CD152|CTLA4']
fig, ax = plt.subplots(2, 4, figsize=(30,14))
for f, ax in tqdm(zip(features, np.ravel(ax))):
    acg_umap(acg_subs['cd4'], color=f, ax=ax)

In [None]:
focus_clusts = list(map(str,[0, 1, 7, 8]))
clusts = acg_subs['cd4'].obs['leiden'].dtype.categories
clust_bool = [i in focus_clusts for i in clusts]
colors = np.array(acg_subs['cd4'].uns['leiden_colors'])

In [None]:
features = ['CD45RO|PTPRC', 'CD45RA|PTPRC', 'IL2RA', 'CD25|IL2RA', 'CD127|IL7R', 'IL7R', 'CTLA4', 'CD152|CTLA4', 'SELL', 'CD62L|SELL', 'CCR7', 'CD197|CCR7']
log_f = ['IL2RA', 'CTLA4']

In [None]:
fig, ax = plt.subplots(3, 4, figsize=(20,12))
for f, ax in tqdm(zip(features, np.ravel(ax)), total=12):
    for clust, color in zip(focus_clusts, colors[clust_bool]):
        ax.hist(np.array(acg_subs['cd4'][acg_subs['cd4'].obs['leiden'] == clust].obs_vector(f)), # need the np.array wrapper because for the genes it returns an "Array View"
                color=color, bins=100, alpha=0.7)
        if f in log_f:
            ax.set_yscale('log')
    ax.set_title(f)

In [None]:
conds = acg_subs['cd4'].obs['cond'].dtype.categories

In [None]:
# one row per condition, one column per cluster; `np.int` was only an alias of
# the builtin `int` and no longer exists in current NumPy
cond_leiden_df = pd.DataFrame(index=conds, columns=clusts, dtype=int)

In [None]:
for cond in tqdm(conds):
    for clust in clusts:
        cond_leiden_df.loc[cond, clust] = acg_subs['cd4'][(acg_subs['cd4'].obs['cond'] == cond) & (acg_subs['cd4'].obs['leiden'] == clust)].shape[0]

In [None]:
cond_leiden_df = cond_leiden_df.divide(cond_leiden_df.sum(0),1)

In [None]:
labels = clusts
props = dict()

for cond in conds:
    props[cond] = cond_leiden_df.loc[cond].values

fig, ax = plt.subplots(figsize=(10,5))
lastpos = [0]*len(clusts)
for cond in props:
    ax.bar(labels, props[cond], label=cond, bottom=lastpos)
    lastpos = props[cond] + lastpos

ax.set_ylabel('Proportion')
ax.legend()

plt.show()

In [None]:
ctdict = dict()
ctdict['N/R_1'] = [0]
ctdict['N/R_2'] = [8]
ctdict['Eff_1'] = [1]
ctdict['Eff_2'] = [7]
ctdict['Other'] = [2]
acg_subs['cd4'].obs['ct4'] = acg_subs['cd4'].obs['leiden']
for ct in ctdict:
    for clust in ctdict[ct]:
        acg_subs['cd4'].obs['ct4'].replace(r'^%s$' % str(clust), ct, regex=True, inplace=True)
acg_subs['cd4'].obs['ct4'] = acg_subs['cd4'].obs['ct4'].astype('category')

In [None]:
acg_umap(acg_subs['cd4'], color='ct4')

#### CD8

In [None]:
acg_umap(acg_subs['cd8'],color='leiden')

In [None]:
features = ['CD45RO|PTPRC', 'CD45RA|PTPRC', 'IL2RA', 'CD25|IL2RA', 'CD127|IL7R', 'IL7R', 'CTLA4', 'CD152|CTLA4']
fig, ax = plt.subplots(2, 4, figsize=(30,14))
for f, ax in tqdm(zip(features, np.ravel(ax))):
    acg_umap(acg_subs['cd8'], color=f, ax=ax)

In [None]:
focus_clusts = list(map(str,[3, 5, 6, 9]))
clusts = acg_subs['cd8'].obs['leiden'].dtype.categories
clust_bool = [i in focus_clusts for i in clusts]
colors = np.array(acg_subs['cd8'].uns['leiden_colors'])

In [None]:
features = ['CD45RO|PTPRC', 'CD45RA|PTPRC', 'IL2RA', 'CD25|IL2RA', 'CD127|IL7R', 'IL7R', 'CTLA4', 'CD152|CTLA4', 'SELL', 'CD62L|SELL', 'CCR7', 'CD197|CCR7']
log_f = ['IL2RA', 'CTLA4', 'SELL', 'CCR7']

In [None]:
fig, ax = plt.subplots(3, 4, figsize=(20,12))
for f, ax in tqdm(zip(features, np.ravel(ax)), total=12):
    for clust, color in zip(focus_clusts, colors[clust_bool]):
        ax.hist(np.array(acg_subs['cd8'][acg_subs['cd8'].obs['leiden'] == clust].obs_vector(f)), # need the np.array wrapper because for the genes it returns an "Array View"
                color=color, bins=100, alpha=0.5)
        if f in log_f:
            ax.set_yscale('log')
    ax.set_title(f)

In [None]:
conds = acg_subs['cd8'].obs['cond'].dtype.categories

In [None]:
# one row per condition, one column per cluster; `np.int` was only an alias of
# the builtin `int` and no longer exists in current NumPy
cond_leiden_df = pd.DataFrame(index=conds, columns=clusts, dtype=int)

In [None]:
for cond in tqdm(conds):
    for clust in clusts:
        cond_leiden_df.loc[cond, clust] = acg_subs['cd8'][(acg_subs['cd8'].obs['cond'] == cond) & (acg_subs['cd8'].obs['leiden'] == clust)].shape[0]

In [None]:
cond_leiden_df = cond_leiden_df.divide(cond_leiden_df.sum(0),1)

In [None]:
labels = clusts
props = dict()

for cond in conds:
    props[cond] = cond_leiden_df.loc[cond].values

fig, ax = plt.subplots(figsize=(10,5))
lastpos = [0]*len(clusts)
for cond in props:
    ax.bar(labels, props[cond], label=cond, bottom=lastpos)
    lastpos = props[cond] + lastpos

ax.set_ylabel('Proportion')
ax.legend()

plt.show()

In [None]:
ctdict = dict()
ctdict['N/R_1'] = [3]
ctdict['N/R_2'] = [9]
ctdict['Eff_Mem'] = [6]
ctdict['Eff'] = [5]
ctdict['Other'] = [10, 12]
acg_subs['cd8'].obs['ct4'] = acg_subs['cd8'].obs['leiden']
for ct in ctdict:
    for clust in ctdict[ct]:
        acg_subs['cd8'].obs['ct4'].replace(r'^%s$' % str(clust), ct, regex=True, inplace=True)
acg_subs['cd8'].obs['ct4'] = acg_subs['cd8'].obs['ct4'].astype('category')

In [None]:
acg_umap(acg_subs['cd8'], color='ct4')

#### NK

In [None]:
acg_umap(acg_subs['nk'], color='leiden')

In [None]:
focus_clusts = list(map(str,[4, 11]))
clusts = acg_subs['nk'].obs['leiden'].dtype.categories
clust_bool = [i in focus_clusts for i in clusts]
colors = np.array(acg_subs['nk'].uns['leiden_colors'])

In [None]:
features = ['FCGR3A', 'CD16|FCGR3A', 'NCAM1', 'CD56|NCAM1']
log_f = ['FCGR3A','NCAM1']

In [None]:
fig, ax = plt.subplots(1, 4, figsize=(20,4))
for f, ax in tqdm(zip(features, np.ravel(ax)), total=4):
    for clust, color in zip(focus_clusts, colors[clust_bool]):
        ax.hist(np.array(acg_subs['nk'][acg_subs['nk'].obs['leiden'] == clust].obs_vector(f)), # need the np.array wrapper because for the genes it returns an "Array View"
                color=color, bins=100, alpha=0.5)
        if f in log_f:
            ax.set_yscale('log')
    ax.set_title(f)

In [None]:
ctdict = dict()
ctdict['CD56dim'] = [4]
ctdict['CD56bright'] = [11]
acg_subs['nk'].obs['ct4'] = acg_subs['nk'].obs['leiden']
for ct in ctdict:
    for clust in ctdict[ct]:
        acg_subs['nk'].obs['ct4'].replace(r'^%s$' % str(clust), ct, regex=True, inplace=True)
acg_subs['nk'].obs['ct4'] = acg_subs['nk'].obs['ct4'].astype('category')

In [None]:
acg_umap(acg_subs['nk'],color='ct4')

#### CD3 Double-Negative

In [None]:
acg_umap(acg_subs['cd3dn'], color='leiden')

In [None]:
features = ['CD3|CD3E', 'CD3E', 'CD8|CD8A', 'CD4|CD4', 'CD94|KLRD1', 'KLRB1', 'NKG7', 'GNLY', 'GZMB']
# size the grid from the feature list: a fixed 2x4 grid has only 8 panels, so
# the ninth feature was silently dropped by zip
ncols = 4
nrows = -(-len(features) // ncols)  # ceiling division
fig, axes = plt.subplots(nrows, ncols, figsize=(30, 7 * nrows))
axes = np.ravel(axes)
# NOTE(review): this plots the full `acg` object rather than acg_subs['cd3dn'] -- confirm that is intended
for f, ax in tqdm(zip(features, axes), total=len(features)):
    acg_umap(acg,color=f, ax=ax)
# hide the panels left over when the grid is larger than the feature list
for ax in axes[len(features):]:
    ax.set_visible(False)

In [None]:
ctdict = dict()
ctdict['NKT'] = [13]
acg_subs['cd3dn'].obs['ct4'] = acg_subs['cd3dn'].obs['leiden']
for ct in ctdict:
    for clust in ctdict[ct]:
        acg_subs['cd3dn'].obs['ct4'].replace(r'^%s$' % str(clust), ct, regex=True, inplace=True)
acg_subs['cd3dn'].obs['ct4'] = acg_subs['cd3dn'].obs['ct4'].astype('category')

In [None]:
acg_umap(acg_subs['cd3dn'], color='ct4')

In [None]:
dict(zip(acg_subs.keys(), map(lambda x: acg_subs[x].obs['ct4'], acg_subs.keys())))